diff --git a/.buildkite/hardware_tests/cpu.yaml b/.buildkite/hardware_tests/cpu.yaml index ba8fd497c39e..92b0196ec2a2 100644 --- a/.buildkite/hardware_tests/cpu.yaml +++ b/.buildkite/hardware_tests/cpu.yaml @@ -12,15 +12,19 @@ steps: - vllm/_custom_ops.py - tests/kernels/attention/test_cpu_attn.py - tests/kernels/moe/test_cpu_fused_moe.py + - tests/kernels/moe/test_cpu_fp8_fused_moe.py - tests/kernels/test_onednn.py - tests/kernels/test_awq_int4_to_int8.py + - tests/kernels/quantization/test_cpu_fp8_scaled_mm.py commands: - | - bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 20m " + bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m " pytest -x -v -s tests/kernels/attention/test_cpu_attn.py pytest -x -v -s tests/kernels/moe/test_cpu_fused_moe.py + pytest -x -v -s tests/kernels/moe/test_cpu_fp8_fused_moe.py pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py" + pytest -x -v -s tests/kernels/test_awq_int4_to_int8.py + pytest -x -v -s tests/kernels/quantization/test_cpu_fp8_scaled_mm.py" - label: CPU-Compatibility Tests depends_on: [] @@ -61,6 +65,7 @@ steps: - vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py - vllm/model_executor/layers/quantization/kernels/scaled_mm/cpu.py - vllm/model_executor/layers/quantization/kernels/mixed_precision/cpu.py + - vllm/model_executor/layers/fused_moe/experts/cpu_moe.py - tests/quantization/test_compressed_tensors.py - tests/quantization/test_cpu_wna16.py commands: diff --git a/.buildkite/intel_jobs/lora_intel.yaml b/.buildkite/intel_jobs/lora_intel.yaml index 366d9daa24dd..6d5bddacf1bc 100644 --- a/.buildkite/intel_jobs/lora_intel.yaml +++ b/.buildkite/intel_jobs/lora_intel.yaml @@ -18,17 +18,18 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_layers.py && pytest -v -s lora/test_lora_checkpoints.py && - (pytest -v -s lora/test_lora_functions.py --deselect="tests/lora/test_lora_functions.py::test_lora_functions_sync" --deselect="tests/lora/test_lora_functions.py::test_lora_functions_async" || true) && + pytest -v -s lora/test_lora_functions.py && pytest -v -s lora/test_lora_huggingface.py && pytest -v -s lora/test_lora_manager.py && pytest -v -s lora/test_lora_utils.py && pytest -v -s lora/test_peft_helper.py && pytest -v -s lora/test_resolver.py && pytest -v -s lora/test_utils.py && - (pytest -v -s lora/test_add_lora.py --deselect="tests/lora/test_add_lora.py::test_add_lora" || true) && - (pytest -v -s lora/test_worker.py --deselect="tests/lora/test_worker.py::test_worker_apply_lora" || true)' + pytest -v -s lora/test_add_lora.py && + pytest -v -s lora/test_worker.py' - label: LoRA Fused/MoE Kernels timeout_in_minutes: 45 @@ -46,6 +47,7 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_fused_moe_lora_kernel.py && pytest -v -s lora/test_moe_lora_align_sum.py' @@ -65,8 +67,9 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && set -o pipefail && - pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-2-2049-64-32-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype1-2-64000-32-4-4]" 
--deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype1-1-102656-32-4-4]"' + pytest -v -s lora/test_punica_ops.py --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-3-43264-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype1-1-2049-64-128-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-1-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-1-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-256-8-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype0-3-2049-128-8-16]" --deselect="tests/lora/test_punica_ops.py::test_kernels[shrink-0-xpu:0-dtype0-1-2049-128-8-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels[expand-0-xpu:0-dtype1-1-2049-256-128-32]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-3-64256-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-2-29696-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype1-3-49408-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[shrink-0-xpu:0-dtype0-2-16384-32-4-4]" --deselect="tests/lora/test_punica_ops.py::test_kernels_hidden_size[expand-0-xpu:0-dtype0-2-51328-32-4-4]"' - label: LoRA Punica FP8/XPU Ops timeout_in_minutes: 45 @@ -84,6 +87,7 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_punica_ops_fp8.py && pytest -v -s lora/test_punica_xpu_ops.py' @@ -103,10 +107,12 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && (pytest -v -s lora/test_mixtral.py --deselect="tests/lora/test_mixtral.py::test_mixtral_lora[4]" || true) && pytest -v -s lora/test_quant_model.py --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model0]" --deselect="tests/lora/test_quant_model.py::test_quant_model_lora[model1]" --deselect="tests/lora/test_quant_model.py::test_quant_model_tp_equality[model0]" && - pytest -v -s lora/test_qwen35_densemodel_lora.py && - pytest -v -s lora/test_transformers_model.py' + pytest -v -s lora/test_transformers_model.py && + pytest -v 
-s lora/test_chatglm3_tp.py && + pytest -s -v lora/test_minicpmv_tp.py' - label: LoRA Multimodal timeout_in_minutes: 45 @@ -124,6 +130,6 @@ steps: - >- bash .buildkite/scripts/hardware_ci/run-intel-test.sh 'cd tests && + export VLLM_WORKER_MULTIPROC_METHOD=spawn && pytest -v -s lora/test_default_mm_loras.py && - (pytest -v -s lora/test_qwen3_unembed.py || true) && pytest -v -s lora/test_whisper.py' diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 74227da45c71..8a900c0bf862 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -37,7 +37,7 @@ steps: agents: queue: arm64_cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_AARCH64}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinuxaarch64-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh" @@ -76,7 +76,7 @@ steps: agents: queue: cpu_queue_release commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.2-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.2 --build-arg torch_cuda_arch_list=\"${CUDA_ARCH_X86}\" --build-arg BUILD_OS=manylinux --build-arg BUILD_BASE_IMAGE=pytorch/manylinux2_28-builder:cuda13.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." 
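+      # Assumption: BUILD_OS=manylinux is a --build-arg consumed by docker/Dockerfile that
+      # switches the wheel build onto the pytorch/manylinux builder base named above,
+      # keeping the nightly wheels manylinux-compatible.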
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "bash .buildkite/scripts/upload-nightly-wheels.sh" @@ -309,6 +309,7 @@ steps: depends_on: ~ - label: "Build release image - x86_64 - CPU" + key: build-cpu-release-image-x86 depends_on: - block-cpu-release-image-build - input-release-version @@ -327,7 +328,8 @@ steps: depends_on: ~ - label: "Build release image - arm64 - CPU" - depends_on: + key: build-cpu-release-image-arm64 + depends_on: - block-arm64-cpu-release-image-build - input-release-version agents: @@ -436,6 +438,41 @@ steps: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" + - block: "Publish release images to DockerHub" + key: block-publish-release-images + depends_on: + - create-multi-arch-manifest + - create-multi-arch-manifest-cuda-12-9 + - create-multi-arch-manifest-ubuntu2404 + - create-multi-arch-manifest-cuda-12-9-ubuntu2404 + - build-rocm-release-image + - input-release-version + # Wait for CPU builds if their block steps were unblocked, so publish + # doesn't race the in-progress CPU build. allow_failure lets publish + # proceed when the operator legitimately leaves the CPU block steps + # unblocked or the CPU build fails. + - step: build-cpu-release-image-x86 + allow_failure: true + - step: build-cpu-release-image-arm64 + allow_failure: true + if: build.env("NIGHTLY") != "1" + + - label: "Publish release images to DockerHub" + depends_on: + - block-publish-release-images + key: publish-release-images-dockerhub + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/publish-release-images.sh" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" + - group: "Publish wheels" key: "publish-wheels" steps: @@ -723,7 +760,7 @@ steps: - "bash tools/vllm-rocm/generate-rocm-wheels-root-index.sh" env: S3_BUCKET: "vllm-wheels" - VARIANT: "rocm721" + VARIANT: "rocm722" # ROCm Job 6: Build ROCm Release Docker Image - label: ":docker: Build release image - x86_64 - ROCm" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index 6f41d1cdda47..afa884fba46b 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -8,8 +8,6 @@ if [ -z "${RELEASE_VERSION}" ]; then RELEASE_VERSION="1.0.0.dev" fi -ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) - buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel (by commit): \`\`\` @@ -25,95 +23,5 @@ aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38- aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_aarch64.whl . 
\`\`\` - -To download and upload the image: - -\`\`\` -# Download images: - -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm -docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} -docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} - -# Tag and push images: - -## CUDA - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 -docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 -docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 -docker push vllm/vllm-openai:latest-x86_64 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 vllm/vllm-openai:x86_64-cu129 -docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129 -docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 -docker push vllm/vllm-openai:latest-x86_64-cu129 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 -docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 -docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 -docker push vllm/vllm-openai:latest-aarch64 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 vllm/vllm-openai:aarch64-cu129 -docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129 -docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 -docker push vllm/vllm-openai:latest-aarch64-cu129 -docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 - -## ROCm - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:latest -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RELEASE_VERSION} -docker push vllm/vllm-openai-rocm:latest -docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} - -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base -docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base -docker push vllm/vllm-openai-rocm:latest-base -docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base - -## CPU - -docker tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:x86_64 -docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:latest-x86_64 -docker tag vllm/vllm-openai-cpu:x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 -docker push vllm/vllm-openai-cpu:latest-x86_64 -docker push 
vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64
-
-docker tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} vllm/vllm-openai-cpu:arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:latest-arm64
-docker tag vllm/vllm-openai-cpu:arm64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker push vllm/vllm-openai-cpu:latest-arm64
-docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-
-# Create multi-arch manifest:
-
-docker manifest rm vllm/vllm-openai:latest
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker manifest push vllm/vllm-openai:latest
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
-
-docker manifest rm vllm/vllm-openai:latest-cu129
-docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
-docker manifest push vllm/vllm-openai:latest-cu129
-docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129
-
-docker manifest rm vllm/vllm-openai-cpu:latest || true
-docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
-docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64
-docker manifest push vllm/vllm-openai-cpu:latest
-docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION}
-\`\`\`
+Docker images are published automatically by the "Publish release images to DockerHub" pipeline step.
 EOF
diff --git a/.buildkite/scripts/ci-fetch-log.sh b/.buildkite/scripts/ci-fetch-log.sh
new file mode 100755
index 000000000000..02798b56f4a9
--- /dev/null
+++ b/.buildkite/scripts/ci-fetch-log.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Usage: ./ci-fetch-log.sh <build-url> [output_file]
+#        ./ci-fetch-log.sh <build-number> <job-uuid> [output_file]
+#
+# Downloads the raw log for a Buildkite job from the public, unauthenticated
+# /organizations/<org>/pipelines/<pipeline>/builds/<build-number>/jobs/<job-uuid>/download
+# endpoint, then strips ANSI/timestamps via ci-clean-log.sh.
+#
+# Find <build-number> and <job-uuid> via:
+#   gh pr checks <pr-number> --repo vllm-project/vllm
+# Each failing row's URL is .../builds/<build-number>#<job-uuid>.
+
+set -euo pipefail
+
+ORG="vllm"
+PIPELINE="ci"
+
+usage() {
+  echo "Usage: $0 <build-url> [output_file]"
+  echo "       $0 <build-number> <job-uuid> [output_file]"
+  exit 1
+}
+
+if [ $# -lt 1 ]; then usage; fi
+
+if [[ "$1" == https://* ]]; then
+  BUILD=$(echo "$1" | sed -nE 's#.*/builds/([0-9]+).*#\1#p')
+  JOB=$(echo "$1" | grep -oE '[0-9a-f]{8}-[0-9a-f-]+' | head -n 1)
+  OUT="${2:-ci-${BUILD}-${JOB:0:8}.log}"
+else
+  if [ $# -lt 2 ]; then usage; fi
+  BUILD="$1"
+  JOB="$2"
+  OUT="${3:-ci-${BUILD}-${JOB:0:8}.log}"
+fi
+
+if [ -z "$BUILD" ] || [ -z "$JOB" ]; then
+  echo "Could not parse build number or job UUID from: $1" >&2
+  usage
+fi
+
+COOKIES=$(mktemp)
+trap 'rm -f "$COOKIES"' EXIT
+
+# Buildkite issues a session cookie on first hit; subsequent /download needs it.
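+# (The first curl below primes the cookie jar; the second reuses it to fetch the raw log.)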
+curl -fsSL -c "$COOKIES" -A "vllm-ci-fetch-log" \ + "https://buildkite.com/${ORG}/${PIPELINE}/builds/${BUILD}" -o /dev/null + +curl -fsSL -b "$COOKIES" -A "vllm-ci-fetch-log" \ + "https://buildkite.com/organizations/${ORG}/pipelines/${PIPELINE}/builds/${BUILD}/jobs/${JOB}/download" \ + -o "$OUT" + +bash "$(dirname "$0")/ci-clean-log.sh" "$OUT" + +echo "$OUT" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 703a7d753220..7e8ddb12ec98 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -378,9 +378,11 @@ HF_MOUNT="/root/.cache/huggingface" # double-quotes will have been stripped by the calling shell. if [[ -n "${VLLM_TEST_COMMANDS:-}" ]]; then commands="${VLLM_TEST_COMMANDS}" + commands_source="env" echo "Commands sourced from VLLM_TEST_COMMANDS (quoting preserved)" else commands="$*" + commands_source="argv" if [[ -z "$commands" ]]; then echo "Error: No test commands provided." >&2 echo "Usage:" >&2 @@ -397,9 +399,15 @@ fi echo "Raw commands: $commands" -# Fix quoting before ROCm overrides (so overrides see correct structure) -commands=$(re_quote_pytest_markers "$commands") -echo "After re-quoting: $commands" +# Only try to repair stripped pytest -m/-k quoting in legacy argv mode. +# VLLM_TEST_COMMANDS preserves inner quoting already, and re-quoting that path +# can corrupt embedded echo strings or otherwise well-formed shell fragments. +if [[ "$commands_source" == "argv" ]]; then + commands=$(re_quote_pytest_markers "$commands") + echo "After re-quoting: $commands" +else + echo "Skipping re-quoting for VLLM_TEST_COMMANDS input" +fi commands=$(apply_rocm_test_overrides "$commands") echo "Final commands: $commands" diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index 7166435ac1e9..9c13fa79fcb2 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -31,6 +31,21 @@ function cpu_tests() { set -e pip list" + # Run kernel tests + docker exec cpu-test bash -c " + set -e + pytest -x -v -s tests/kernels/test_onednn.py + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/core/test_cpu_activation.py + pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" + + # skip tests requiring model downloads if HF_TOKEN is not set + # due to rate-limits + if [ -z "$HF_TOKEN" ]; then + echo "Warning: HF_TOKEN is not set. Skipping tests that require model downloads." + return + fi + # offline inference docker exec cpu-test bash -c " set -e @@ -46,13 +61,6 @@ function cpu_tests() { set -e pytest -x -v -s tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" - # Run kernel tests - docker exec cpu-test bash -c " - set -e - pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/attention/test_cpu_attn.py - pytest -x -v -s tests/kernels/core/test_cpu_activation.py - pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" # basic online serving docker exec cpu-test bash -c ' @@ -67,6 +75,21 @@ function cpu_tests() { --num-prompts 20 \ --endpoint /v1/completions kill -s SIGTERM $server_pid &' + + # smoke test for Gated DeltaNet + docker exec cpu-test bash -c ' + set -e + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3.5-0.8B --max-model-len 2048 & + server_pid=$! 
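+    # Wait up to 600s for the API server to come up before running the benchmark.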
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + vllm bench serve \ + --backend vllm \ + --dataset-name random \ + --model Qwen/Qwen3.5-0.8B \ + --num-prompts 20 \ + --endpoint /v1/completions + kill -s SIGTERM $server_pid &' + } # All of CPU tests are expected to be finished less than 40 mins. diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index feaf2b356267..61ebddf82e40 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -136,8 +136,6 @@ run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" run_and_track_test 4 "test_quantization_accuracy.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" -run_and_track_test 5 "examples/offline_inference/tpu.py" \ - "python3 /workspace/vllm/examples/offline_inference/tpu.py" run_and_track_test 6 "test_tpu_model_runner.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" run_and_track_test 7 "test_sampler.py" \ diff --git a/.buildkite/scripts/publish-release-images.sh b/.buildkite/scripts/publish-release-images.sh new file mode 100755 index 000000000000..ec319aa76006 --- /dev/null +++ b/.buildkite/scripts/publish-release-images.sh @@ -0,0 +1,180 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Publish release Docker images from ECR to DockerHub. +# Pulls per-arch images, tags with latest and versioned tags, pushes them, +# then creates and pushes multi-arch manifests. 
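+#
+# Tag scheme (mirroring the commands below): per-arch tags such as
+# vllm/vllm-openai:latest-x86_64 and vllm/vllm-openai:v<version>-aarch64 are
+# pushed first, then combined into multi-arch manifests (:latest, :v<version>);
+# the -cu129 and -ubuntu2404 variants and the -rocm/-cpu flavors follow the
+# same pattern.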
+ +set -euo pipefail + +RELEASE_VERSION=$(buildkite-agent meta-data get release-version --default "" | sed 's/^v//') +if [ -z "${RELEASE_VERSION}" ]; then + echo "ERROR: release-version metadata not set" + exit 1 +fi + +COMMIT="$BUILDKITE_COMMIT" +ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) + +echo "========================================" +echo "Publishing release images v${RELEASE_VERSION}" +echo " Commit: ${COMMIT}" +echo " ROCm base cache key: ${ROCM_BASE_CACHE_KEY}" +echo "========================================" + +# Login to ECR to pull staging images +aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + +# ---- CUDA (default: 13.0) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:latest-x86_64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 +docker push vllm/vllm-openai:latest-x86_64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:latest-aarch64 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker push vllm/vllm-openai:latest-aarch64 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 + +docker manifest rm vllm/vllm-openai:latest || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION} || true +docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 +docker manifest push vllm/vllm-openai:latest +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} + +# ---- CUDA 12.9 ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 +docker push vllm/vllm-openai:latest-x86_64-cu129 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 +docker push vllm/vllm-openai:latest-aarch64-cu129 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 + +docker manifest rm vllm/vllm-openai:latest-cu129 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129 || true +docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129 +docker manifest push vllm/vllm-openai:latest-cu129 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129 + +# ---- Ubuntu 24.04 (CUDA 13.0) ---- + +docker pull 
public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 +docker push vllm/vllm-openai:latest-x86_64-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 +docker push vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 + +docker manifest rm vllm/vllm-openai:latest-ubuntu2404 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 || true +docker manifest create vllm/vllm-openai:latest-ubuntu2404 vllm/vllm-openai:latest-x86_64-ubuntu2404 vllm/vllm-openai:latest-aarch64-ubuntu2404 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-ubuntu2404 +docker manifest push vllm/vllm-openai:latest-ubuntu2404 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-ubuntu2404 + +# ---- Ubuntu 24.04 (CUDA 12.9) ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 +docker push vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-aarch64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 +docker push vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 + +docker manifest rm vllm/vllm-openai:latest-cu129-ubuntu2404 || true +docker manifest rm vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 || true +docker manifest create vllm/vllm-openai:latest-cu129-ubuntu2404 vllm/vllm-openai:latest-x86_64-cu129-ubuntu2404 vllm/vllm-openai:latest-aarch64-cu129-ubuntu2404 +docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129-ubuntu2404 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129-ubuntu2404 +docker manifest push vllm/vllm-openai:latest-cu129-ubuntu2404 +docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129-ubuntu2404 + +# ---- ROCm ---- + +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:latest +docker tag 
public.ecr.aws/q9t5s3a7/vllm-release-repo:${COMMIT}-rocm vllm/vllm-openai-rocm:v${RELEASE_VERSION} +docker push vllm/vllm-openai-rocm:latest +docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} + +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:latest-base +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base +docker push vllm/vllm-openai-rocm:latest-base +docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base + +# ---- CPU ---- +# CPU images are behind separate block steps and may not have been built. +# All-or-nothing: inspect both arches first, then either publish everything +# (per-arch + multi-arch manifest) or skip everything. Publishing only one +# arch would leave `:latest-x86_64` pointing at the new release while the +# `:latest` multi-arch manifest still resolves to the previous release. + +CPU_X86_TAG=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} +CPU_ARM_TAG=public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} + +CPU_X86_AVAILABLE=false +CPU_ARM_AVAILABLE=false +docker manifest inspect "${CPU_X86_TAG}" >/dev/null 2>&1 && CPU_X86_AVAILABLE=true +docker manifest inspect "${CPU_ARM_TAG}" >/dev/null 2>&1 && CPU_ARM_AVAILABLE=true + +if [ "$CPU_X86_AVAILABLE" = "true" ] && [ "$CPU_ARM_AVAILABLE" = "true" ]; then + docker pull "${CPU_X86_TAG}" + docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:latest-x86_64 + docker tag "${CPU_X86_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 + docker push vllm/vllm-openai-cpu:latest-x86_64 + docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 + + docker pull "${CPU_ARM_TAG}" + docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:latest-arm64 + docker tag "${CPU_ARM_TAG}" vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + docker push vllm/vllm-openai-cpu:latest-arm64 + docker push vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + + docker manifest rm vllm/vllm-openai-cpu:latest || true + docker manifest rm vllm/vllm-openai-cpu:v${RELEASE_VERSION} || true + docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64 + docker manifest create vllm/vllm-openai-cpu:v${RELEASE_VERSION} vllm/vllm-openai-cpu:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai-cpu:v${RELEASE_VERSION}-arm64 + docker manifest push vllm/vllm-openai-cpu:latest + docker manifest push vllm/vllm-openai-cpu:v${RELEASE_VERSION} +elif [ "$CPU_X86_AVAILABLE" = "false" ] && [ "$CPU_ARM_AVAILABLE" = "false" ]; then + echo "WARNING: Neither CPU image found in ECR, skipping CPU publish (ensure block-cpu-release-image-build and block-arm64-cpu-release-image-build were unblocked and the builds finished pushing)" +else + # Partial state: one arch built, the other did not. Fail loudly rather than + # ship a Docker Hub state where `:latest-${arch}` and `:latest` (multi-arch) + # disagree on which release they point at. + echo "ERROR: Partial CPU build detected (x86_64=${CPU_X86_AVAILABLE}, arm64=${CPU_ARM_AVAILABLE})." + echo " Refusing to publish to avoid split-tag drift between per-arch and multi-arch tags." + echo " Re-run the missing CPU build and retry, or manually publish if a single-arch release is intended." 
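+  # Exit non-zero so this Buildkite step fails visibly; re-running after the
+  # missing CPU build completes will publish both arches together.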
+ exit 1 +fi + +echo "" +echo "Successfully published release images for v${RELEASE_VERSION}" diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh index de48eb282a65..0eadfa1f80b4 100755 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh @@ -51,6 +51,7 @@ vllm serve "$MODEL" \ --offload-num-in-group 2 \ --offload-prefetch-step 1 \ --offload-params w13_weight w2_weight \ + --generation-config vllm \ --port "$PORT" \ ${EXTRA_ARGS+"${EXTRA_ARGS[@]}"} & SERVER_PID=$! diff --git a/.buildkite/scripts/upload-release-wheels-pypi.sh b/.buildkite/scripts/upload-release-wheels-pypi.sh index 058e5bbe4f4c..7e2077a2692c 100644 --- a/.buildkite/scripts/upload-release-wheels-pypi.sh +++ b/.buildkite/scripts/upload-release-wheels-pypi.sh @@ -39,10 +39,11 @@ fi set -x # avoid printing secrets above -# install twine from pypi +# install twine and sdist build prerequisites from pypi python3 -m venv /tmp/vllm-release-env source /tmp/vllm-release-env/bin/activate pip install twine +pip install -r requirements/build/cuda.txt python3 -m twine --version # copy release wheels to local directory diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 844dbe639b3c..bad2796266a7 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -230,7 +230,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -272,7 +271,6 @@ steps: - tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -396,8 +394,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 
--enable-chunked-prefill --max-model-len 1536 @@ -590,7 +588,6 @@ steps: - vllm/platforms/rocm.py commands: - pip freeze | grep -E 'torch' - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Multi-Modal Models (Extended Generation 2) # TBD @@ -621,6 +618,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] agent_pool: mi250_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -864,7 +862,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -930,6 +927,7 @@ steps: - tests/renderers - tests/standalone_tests/lazy_imports.py - tests/tokenizers_ + - tests/reasoning - tests/tool_parsers - tests/transformers_utils - tests/config @@ -942,7 +940,7 @@ steps: - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config @@ -1173,7 +1171,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization @@ -1187,7 +1184,6 @@ steps: source_file_dependencies: - vllm/ commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_custom_all_reduce.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' @@ -1207,7 +1203,6 @@ steps: - tests/examples/features/data_parallel/data_parallel_offline.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py @@ -1253,7 +1248,6 @@ steps: - vllm/platforms/rocm.py commands: - export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s distributed/test_ray_v2_executor.py - pytest -v -s distributed/test_ray_v2_executor_e2e.py - pytest -v -s distributed/test_pipeline_parallel.py -k "ray" @@ -1275,7 +1269,6 @@ steps: - vllm/v1/worker/gpu_worker.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - torchrun --nproc-per-node=8 ../examples/features/torchrun/torchrun_dp_example_offline.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep #-------------------------------------------------------- mi300 · entrypoints --------------------------------------------------------# @@ -1656,8 +1649,8 @@ 
steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -1803,6 +1796,7 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -1844,6 +1838,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2204,7 +2199,6 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi300] agent_pool: mi300_1 - optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2281,7 +2275,6 @@ steps: - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2301,7 +2294,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/rl/rlhf_async_new_apis.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/features/data_parallel/data_parallel_offline.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2364,7 +2356,6 @@ steps: - tests/distributed/test_utils - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py @@ -2494,7 +2485,6 @@ steps: - tests/entrypoints/llm/test_collective_rpc.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/fullgraph/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -2519,7 +2509,6 @@ steps: - 
tests/v1/worker/test_worker_memory_snapshot.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown @@ -2540,7 +2529,6 @@ steps: - tests/distributed/test_multiproc_executor.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s compile/fullgraph/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py @@ -2628,6 +2616,7 @@ steps: agent_pool: mi325_1 torch_nightly: true parallelism: 2 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2653,6 +2642,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_1 torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2718,7 +2708,6 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2749,6 +2738,7 @@ steps: agent_pool: mi355_1 fast_check: true torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2764,6 +2754,7 @@ steps: agent_pool: mi355_1 fast_check: true torch_nightly: true + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2939,8 +2930,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # Features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -3060,6 +3051,7 @@ steps: timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -3259,6 +3251,7 @@ steps: timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -3284,6 +3277,7 @@ steps: 
timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355] agent_pool: mi355_1 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml index 9b1f67705d6e..d3947a03162b 100644 --- a/.buildkite/test_areas/attention.yaml +++ b/.buildkite/test_areas/attention.yaml @@ -17,7 +17,7 @@ steps: - label: V1 attention (B200) key: v1-attention-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s source_file_dependencies: - vllm/config/attention.py - vllm/model_executor/layers/attention diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index 3385f9472ae6..85f804780179 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -14,7 +14,7 @@ steps: - label: Attention Benchmarks Smoke Test (B200) key: attention-benchmarks-smoke-test-b200 - device: b200 + device: b200-k8s num_gpus: 2 optional: true working_dir: "/vllm-workspace/" diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 638e5b0eec9b..01248738d519 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -43,7 +43,7 @@ steps: key: asynctp-correctness-tests-b200 timeout_in_minutes: 50 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s optional: true num_devices: 2 commands: @@ -68,7 +68,7 @@ steps: key: fusion-and-compile-unit-tests-2xb200 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/ @@ -137,7 +137,7 @@ steps: key: fusion-e2e-config-sweep-b200 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s num_devices: 1 optional: true commands: @@ -209,7 +209,7 @@ steps: key: fusion-e2e-tp2-b200 timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s num_devices: 2 source_file_dependencies: - csrc/quantization/ diff --git a/.buildkite/test_areas/disaggregated.yaml b/.buildkite/test_areas/disaggregated.yaml index cef2c69d775a..e68b9e1add8b 100644 --- a/.buildkite/test_areas/disaggregated.yaml +++ b/.buildkite/test_areas/disaggregated.yaml @@ -8,7 +8,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -19,7 +19,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -31,7 +31,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -43,7 +43,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - 
vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -55,7 +55,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 4 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - tests/v1/kv_connector/nixl_integration/ commands: - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt @@ -67,7 +67,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ @@ -83,7 +83,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/v1/worker/kv_connector_model_runner_mixin.py - tests/v1/kv_connector/nixl_integration/ commands: @@ -96,7 +96,7 @@ steps: working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: - - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - vllm/distributed/kv_transfer/kv_connector/v1/nixl/ - vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading_connector.py - vllm/distributed/kv_transfer/kv_connector/v1/offloading/ diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 6f45a38eeb39..8aa41a9a26ab 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -212,7 +212,7 @@ steps: - label: Distributed Tests (2 GPUs)(B200) key: distributed-tests-2-gpus-b200 - device: b200 + device: b200-k8s optional: true working_dir: "/vllm-workspace/" num_devices: 2 diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml index 8df3b2fd0d9e..bb8aa14eac18 100644 --- a/.buildkite/test_areas/e2e_integration.yaml +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -25,7 +25,7 @@ steps: - label: Qwen3-30B-A3B-FP8-block Accuracy (B200) key: qwen3-30b-a3b-fp8-block-accuracy-b200 timeout_in_minutes: 60 - device: b200 + device: b200-k8s optional: true num_devices: 2 working_dir: "/vllm-workspace" diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index 17ac44803aaf..cf0f028255d2 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -13,8 +13,9 @@ steps: - tests/test_config - tests/test_logger - tests/test_vllm_port + - tests/test_jit_monitor.py commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py test_jit_monitor.py - label: Engine (1 GPU) key: engine-1-gpu @@ -56,11 +57,6 @@ steps: commands: # Only run tests that need exactly 2 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" - mirror: - amd: - device: mi325_2 - depends_on: - - image-build-amd - label: V1 e2e (4 GPUs) key: v1-e2e-4-gpus @@ -73,11 +69,6 @@ steps: commands: # Only run 
tests that need 4 GPUs - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" - mirror: - amd: - device: mi325_4 - depends_on: - - image-build-amd - label: V1 e2e (4xH100) key: v1-e2e-4xh100 diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 4908df1e4eca..ba92d3a3aec0 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -26,6 +26,11 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + mirror: + amd: + device: mi300_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (API Server openai - Part 1) key: entrypoints-integration-api-server-openai-part-1 @@ -38,11 +43,6 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server openai - Part 2) @@ -57,11 +57,6 @@ steps: - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py - pytest -v -s entrypoints/openai/speech_to_text/ - pytest -v -s entrypoints/test_chat_utils.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Entrypoints Integration (API Server openai - Part 3) key: entrypoints-integration-api-server-openai-part-3 diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml index b7cd1eae65a4..34e1e4832d9d 100644 --- a/.buildkite/test_areas/kernels.yaml +++ b/.buildkite/test_areas/kernels.yaml @@ -125,7 +125,7 @@ steps: key: kernels-b200 timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s # optional: true source_file_dependencies: - csrc/quantization/fp4/ @@ -212,7 +212,7 @@ steps: - label: Kernels Fp4 MoE Test (B200) key: kernels-fp4-moe-test-b200 timeout_in_minutes: 60 - device: b200 + device: b200-k8s num_devices: 1 optional: true commands: diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 67b87ab6921f..e5a163d17c7e 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -51,7 +51,7 @@ steps: - label: LM Eval Qwen3.5 Models (B200) key: lm-eval-qwen3-5-models-b200 timeout_in_minutes: 120 - device: b200 + device: b200-k8s optional: true num_devices: 2 source_file_dependencies: @@ -84,7 +84,7 @@ steps: - label: MoE Refactor Integration Test (B200 - TEMPORARY) key: moe-refactor-integration-test-b200-temporary - device: b200 + device: b200-k8s optional: true num_devices: 2 commands: diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index 3ff96dcbdfb7..877d931a12ef 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -12,11 +12,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn # TODO: create another `optional` test group for slow tests - pytest -v -s -m 'not slow_test' v1/spec_decode - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: V1 Sample + Logits key: v1-sample-logits @@ -38,7 +33,7 @@ steps: - pytest -v -s v1/test_outputs.py mirror: amd: 
- device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -67,11 +62,6 @@ steps: # Integration test for streaming correctness (requires special branch). - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: V1 Others (CPU) key: v1-others-cpu @@ -127,8 +117,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -200,7 +190,7 @@ steps: - pytest -v -s -m 'cpu_test' multimodal - pytest -v -s renderers - pytest -v -s tokenizers_ - - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py --ignore=reasoning/test_gemma4_reasoning_parser.py + - pytest -v -s reasoning --ignore=reasoning/test_seedoss_reasoning_parser.py --ignore=reasoning/test_glm4_moe_reasoning_parser.py - pytest -v -s tool_parsers - pytest -v -s transformers_utils - pytest -v -s config @@ -224,7 +214,7 @@ steps: - label: Batch Invariance (B200) key: batch-invariance-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s source_file_dependencies: - vllm/v1/attention - vllm/model_executor/layers diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 6a4338a5e40a..9dfd046289e8 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -37,7 +37,7 @@ steps: - examples/generate/multimodal/ - examples/features/ - examples/pooling/embed/vision_embedding_offline.py - - examples/others/tensorize_vllm_model.py + - examples/features/tensorize_vllm_model.py commands: - set -x - export VLLM_USE_V2_MODEL_RUNNER=1 @@ -55,8 +55,8 @@ steps: - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 features/automatic_prefix_caching/prefix_caching_offline.py - - python3 offline_inference/llm_engine_example.py - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 
others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 deployment/llm_engine_example.py + - python3 features/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 features/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 features/speculative_decoding/spec_decode_offline.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index ab597f960e74..8fca203de44f 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -42,12 +42,6 @@ steps: - tests/models/test_registry.py commands: - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - - label: Basic Models Test (Other CPU) # 5min key: basic-models-test-other-cpu diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index 64f8fc809d46..b560c5a4769a 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -48,6 +48,14 @@ steps: parallelism: 2 mirror: torch_nightly: {} + amd: + device: mi300_1 + depends_on: + - image-build-amd + commands: + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB - label: Language Models Test (Extended Generation) # 80min key: language-models-test-extended-generation @@ -62,15 +70,6 @@ steps: - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - commands: - - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.6.0' - - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' - label: Language Models Test (PPL) key: language-models-test-ppl @@ -92,11 +91,6 @@ steps: - tests/models/language/pooling commands: - pytest -v -s models/language/pooling -m 'not core_model' - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Language Models Test (MTEB) key: language-models-test-mteb diff --git 
a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml index 857cc68f2a92..1f66393df818 100644 --- a/.buildkite/test_areas/models_multimodal.yaml +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -15,7 +15,7 @@ steps: - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -33,7 +33,7 @@ steps: - pytest -v -s models/multimodal/generation/test_vit_cudagraph.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -49,7 +49,7 @@ steps: - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd @@ -64,11 +64,6 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/generation/test_memory_leak.py --ignore models/multimodal/processing - pytest models/multimodal/generation/test_memory_leak.py -m core_model - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - label: Multi-Modal Processor (CPU) key: multi-modal-processor-cpu @@ -120,7 +115,7 @@ steps: - pytest -v -s models/multimodal/test_mapping.py mirror: amd: - device: mi325_1 + device: mi300_1 depends_on: - image-build-amd diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml index 0ccca528401b..8a9a36da4481 100644 --- a/.buildkite/test_areas/quantization.yaml +++ b/.buildkite/test_areas/quantization.yaml @@ -25,7 +25,7 @@ steps: key: quantized-moe-test-b200 timeout_in_minutes: 60 working_dir: "/vllm-workspace/" - device: b200 + device: b200-k8s source_file_dependencies: - tests/quantization/test_blackwell_moe.py - vllm/model_executor/models/deepseek_v2.py diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index 37f8eaa6883c..48e9f55571e4 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -17,7 +17,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers mirror: amd: - device: mi325_1 + device: mi250_1 depends_on: - image-build-amd commands: diff --git a/.buildkite/test_areas/spec_decode.yaml b/.buildkite/test_areas/spec_decode.yaml index 726043769c91..5253f54735aa 100644 --- a/.buildkite/test_areas/spec_decode.yaml +++ b/.buildkite/test_areas/spec_decode.yaml @@ -75,7 +75,7 @@ steps: - label: Spec Decode Draft Model Nightly B200 key: spec-decode-draft-model-nightly-b200 timeout_in_minutes: 30 - device: b200 + device: b200-k8s optional: true source_file_dependencies: - vllm/v1/spec_decode/ diff --git a/.github/mergify.yml b/.github/mergify.yml index de3c76fd458b..2d36e3507028 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -477,9 +477,7 @@ pull_request_rules: conditions: - label != stale - or: - - files~=^examples/online_serving/disaggregated[^/]*/.* - - files~=^examples/offline_inference/disaggregated[^/]*/.* - - files~=^examples/others/lmcache/ + - files~=^examples/disaggregated/ - files~=^tests/v1/kv_connector/ - 
files~=^vllm/distributed/kv_transfer/ - title~=(?i)\bP/?D\b diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 33b1db69dec4..f1e0afebf213 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -131,6 +131,16 @@ repos: --python-version, "3.12", ] files: ^requirements/(common|xpu|test/xpu)\.(in|txt)$ + - id: pip-compile + alias: pip-compile-docs + name: pip-compile-docs + args: [ + requirements/docs.in, + -o, requirements/docs.txt, + --python-platform, x86_64-manylinux_2_28, + --python-version, "3.12", + ] + files: ^requirements/docs\.(in|txt)$ - repo: local hooks: - id: format-torch-nightly-test diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 1e479fd03d91..f372a3fb8cc9 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,7 +9,6 @@ build: python: "3.12" jobs: post_checkout: - # - bash docs/maybe_skip_pr_build.sh - git fetch origin main --unshallow --no-tags --filter=blob:none || true pre_create_environment: - pip install uv diff --git a/CMakeLists.txt b/CMakeLists.txt index bf4ac05e4f29..13788fa87437 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -307,12 +307,12 @@ set(VLLM_EXT_SRC "csrc/quantization/activation_kernels.cu" "csrc/cuda_utils_kernels.cu" "csrc/custom_all_reduce.cu" - "csrc/torch_bindings.cpp") + "csrc/torch_bindings.cpp" + "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_EXT_SRC - "csrc/minimax_reduce_rms_kernel.cu" - "csrc/fused_deepseek_v4_qnorm_rope_kv_insert_kernel.cu") + "csrc/minimax_reduce_rms_kernel.cu") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") @@ -1047,13 +1047,13 @@ endif() set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" - "csrc/moe/topk_softmax_kernels.cu") + "csrc/moe/topk_softmax_kernels.cu" + "csrc/moe/topk_softplus_sqrt_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu" - "csrc/moe/grouped_topk_kernels.cu" - "csrc/moe/topk_softplus_sqrt_kernels.cu") + "csrc/moe/grouped_topk_kernels.cu") endif() if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py index 3e3c79217982..2f56099c66fd 100644 --- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py +++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py @@ -1473,6 +1473,12 @@ async def main() -> None: "(for example: --warmup-percentages=0%%,50%%)", ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code when loading the tokenizer.", + ) + args = parser.parse_args() logger.info(args) @@ -1515,7 +1521,9 @@ async def main() -> None: np.random.seed(args.seed) logger.info("Loading tokenizer") - tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer = AutoTokenizer.from_pretrained( + args.model, trust_remote_code=args.trust_remote_code + ) await get_server_info(args.url) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 8535186cc1ec..d27a5ea93dea 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -32,18 +32,23 @@ else() "-DVLLM_CPU_EXTENSION") # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0) - # and create a local shim dir with it + # and create a local shim dir with it. 
When PyTorch is built from source or packaged + # by a distro (common on RISC-V, s390x, Fedora/RHEL aarch64), no vendored libgomp + # exists and the shim dir is empty; fall back to the system libgomp in that case. vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR) - find_library(OPEN_MP - NAMES gomp - PATHS ${VLLM_TORCH_GOMP_SHIM_DIR} - NO_DEFAULT_PATH - REQUIRED - ) - # Set LD_LIBRARY_PATH to include the shim dir at build time to use the same libgomp as PyTorch - if (OPEN_MP) + if(VLLM_TORCH_GOMP_SHIM_DIR) + find_library(OPEN_MP + NAMES gomp + PATHS "${VLLM_TORCH_GOMP_SHIM_DIR}" + NO_DEFAULT_PATH + REQUIRED + ) + # Use the same libgomp as PyTorch at runtime set(ENV{LD_LIBRARY_PATH} "${VLLM_TORCH_GOMP_SHIM_DIR}:$ENV{LD_LIBRARY_PATH}") + else() + # Fall back to system / toolchain libgomp + find_library(OPEN_MP NAMES gomp REQUIRED) endif() endif() @@ -321,14 +326,6 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND set(ONEDNN_VERBOSE "ON") set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) - # TODO: Refactor this - if (ENABLE_X86_ISA) - # Note: only enable oneDNN for AVX512 - list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS_AVX512}) - else() - list(APPEND DNNL_COMPILE_FLAGS ${CXX_COMPILE_FLAGS}) - endif() - set(VLLM_BUILD_TYPE ${CMAKE_BUILD_TYPE}) set(CMAKE_BUILD_TYPE "Release") # remove oneDNN debug symbols to reduce size FetchContent_MakeAvailable(oneDNN) @@ -341,8 +338,14 @@ if (ENABLE_X86_ISA OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND PRIVATE ${oneDNN_SOURCE_DIR}/src ) target_link_libraries(dnnl_ext dnnl torch) - target_compile_options(dnnl_ext PRIVATE ${DNNL_COMPILE_FLAGS} -fPIC) + if (ENABLE_X86_ISA) + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS_AVX2} -fPIC) + else() + target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC) + endif() list(APPEND LIBS dnnl_ext) + + set(USE_ONEDNN ON) else() set(USE_ONEDNN OFF) @@ -406,12 +409,15 @@ endif() if (ENABLE_X86_ISA) set(VLLM_EXT_SRC_SGL + "csrc/cpu/sgl-kernels/fla.cpp" + "csrc/cpu/sgl-kernels/conv.cpp" "csrc/cpu/sgl-kernels/gemm.cpp" "csrc/cpu/sgl-kernels/gemm_int8.cpp" "csrc/cpu/sgl-kernels/gemm_fp8.cpp" "csrc/cpu/sgl-kernels/gemm_int4.cpp" "csrc/cpu/sgl-kernels/moe.cpp" "csrc/cpu/sgl-kernels/moe_int8.cpp" + "csrc/cpu/sgl-kernels/moe_int4.cpp" "csrc/cpu/sgl-kernels/moe_fp8.cpp") set(VLLM_EXT_SRC_AVX512 @@ -430,10 +436,11 @@ if (ENABLE_X86_ISA) "csrc/cpu/pos_encoding.cpp" "csrc/moe/dynamic_4bit_int_moe_cpu.cpp") - set(VLLM_EXT_SRC_AVX2 + set(VLLM_EXT_SRC_AVX2 "csrc/cpu/utils.cpp" "csrc/cpu/spec_decode_utils.cpp" "csrc/cpu/cpu_attn.cpp" + "csrc/cpu/dnnl_kernels.cpp" "csrc/cpu/torch_bindings.cpp" # TODO: Remove these files "csrc/cpu/activation.cpp" @@ -448,7 +455,7 @@ if (ENABLE_X86_ISA) set(_C_LIBS numa dnnl_ext) set(_C_AVX512_LIBS numa dnnl_ext) - set(_C_AVX2_LIBS numa) + set(_C_AVX2_LIBS numa dnnl_ext) # AMX + AVX512F + AVX512BF16 + AVX512VNNI define_extension_target( diff --git a/cmake/external_projects/deepgemm.cmake b/cmake/external_projects/deepgemm.cmake index b821b90ec8e9..0d7ea43fb7d0 100644 --- a/cmake/external_projects/deepgemm.cmake +++ b/cmake/external_projects/deepgemm.cmake @@ -59,26 +59,11 @@ if(DEEPGEMM_ARCHS) # Build the _C pybind11 extension from DeepGEMM's C++ source. # This is a CXX-only module — CUDA kernels are JIT-compiled at runtime. # - # Free-threaded Python doesn't yet support the stable ABI, so skip USE_SABI - # there. (The other vLLM extensions get this guard for free via - # define_extension_target; this target uses raw Python_add_library.) 
-  run_python(IS_FREETHREADED_PYTHON
-    "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
-    "Failed to determine whether interpreter is free-threaded")
-  if (NOT IS_FREETHREADED_PYTHON)
-    Python_add_library(_deep_gemm_C MODULE WITH_SOABI USE_SABI 3
-      "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
-  else()
-    Python_add_library(_deep_gemm_C MODULE WITH_SOABI
-      "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
-  endif()
+  Python_add_library(_deep_gemm_C MODULE WITH_SOABI
+    "${deepgemm_SOURCE_DIR}/csrc/python_api.cpp")
 
   # The pybind11 module name must be _C to match DeepGEMM's Python imports.
-  # Place the build artifact in a subdir so it doesn't collide with vLLM's own
-  # `_C.abi3.so` in the build tree (the install destination still differs).
-  set_target_properties(_deep_gemm_C PROPERTIES
-    OUTPUT_NAME "_C"
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/deep_gemm")
+  set_target_properties(_deep_gemm_C PROPERTIES OUTPUT_NAME "_C")
 
   target_compile_definitions(_deep_gemm_C PRIVATE
     "-DTORCH_EXTENSION_NAME=_C")
@@ -90,15 +75,11 @@
     "${deepgemm_SOURCE_DIR}/third-party/cutlass/tools/util/include"
     "${deepgemm_SOURCE_DIR}/third-party/fmt/include")
 
-  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
-  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
   target_compile_options(_deep_gemm_C PRIVATE
     $<$<COMPILE_LANGUAGE:CXX>:-std=c++17>
     $<$<COMPILE_LANGUAGE:CXX>:-O3>
     $<$<COMPILE_LANGUAGE:CXX>:-Wno-psabi>
-    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>
-    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>
-    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>)
+    $<$<COMPILE_LANGUAGE:CXX>:-Wno-deprecated-declarations>)
 
   # torch_python is required because DeepGEMM uses pybind11 type casters
   # for at::Tensor (via PYBIND11_MODULE), unlike vLLM's own extensions which
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 7e456d32598b..895490f45a79 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -97,13 +97,13 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  // Use cuMemcpyBatchAsync (CUDA 12.8+) to submit all copies in a single
-  // driver call, amortizing per-copy submission overhead.
-  // int64_t and CUdeviceptr/size_t are both 8 bytes on 64-bit platforms,
-  // so we reinterpret_cast the tensor data directly to avoid copies.
-  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
+  // Use cuMemcpyBatchAsync / hipMemcpyBatchAsync to submit all copies in a
+  // single driver call, amortizing per-copy submission overhead. int64_t
+  // and CUdeviceptr/void*/size_t are all 8 bytes on 64-bit platforms, so we
+  // reinterpret_cast the tensor data directly to avoid copies.
   static_assert(sizeof(size_t) == sizeof(int64_t));
 #if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION >= 12080
+  static_assert(sizeof(CUdeviceptr) == sizeof(int64_t));
   // Resolve cuMemcpyBatchAsync at runtime via cuGetProcAddress so that
   // binaries compiled with CUDA 12.8+ still work on older drivers, and
   // we avoid the CUDA 13.0 header remapping (#define to _v2 signature).
@@ -134,12 +134,30 @@ void swap_blocks_batch(const torch::Tensor& src_ptrs,
                            &fail_idx, static_cast<CUstream>(stream));
     TORCH_CHECK(result == CUDA_SUCCESS, "cuMemcpyBatchAsync failed at index ",
                 fail_idx, " with error ", result);
-  } else
+    return;
+  }
+#elif defined(USE_ROCM) && defined(HIP_VERSION) && HIP_VERSION >= 70100000
+  // ROCm 7.1+ exposes hipMemcpyBatchAsync. The 7.2.1 implementation early-
+  // returns hipErrorNotSupported whenever numAttrs > 0 (see ROCm/clr @
+  // rocm-7.2.1 hipamd/src/hip_memory.cpp:2819-2822), so call with
+  // numAttrs=0.
+  {
+    hipMemcpyAttributes attr = {};
+    size_t attrs_idx = 0;
+    size_t fail_idx = 0;
+    hipError_t result = hipMemcpyBatchAsync(
+        reinterpret_cast<void**>(dst_data), reinterpret_cast<void**>(src_data),
+        reinterpret_cast<size_t*>(size_data), static_cast<size_t>(n), &attr,
+        &attrs_idx, 0, &fail_idx, static_cast<hipStream_t>(stream));
+    TORCH_CHECK(result == hipSuccess, "hipMemcpyBatchAsync failed at index ",
+                fail_idx, " with error ", result);
+    return;
+  }
 #endif
   {
-    // Fallback for CUDA < 12.8, older drivers, and ROCm:
-    // individual async copies.
-    // cudaMemcpyDefault lets the driver infer direction from pointer types.
+    // Fallback for CUDA < 12.8, older CUDA drivers, and ROCm < 7.1:
+    // individual async copies. cudaMemcpyDefault lets the driver infer
+    // direction from pointer types.
     for (int64_t i = 0; i < n; i++) {
       cudaMemcpyAsync(reinterpret_cast<void*>(dst_data[i]),
                       reinterpret_cast<const void*>(src_data[i]),
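The batched path amortizes per-copy submission overhead, and the `cuGetProcAddress` indirection keeps the binary loadable on drivers that predate the symbol. Below is a minimal host-side sketch of that resolve-or-fall-back pattern, illustrative only: `copy_blocks_sketch` and `resolve_batch_memcpy` are hypothetical names, the function-pointer signature and the `CUmemcpyAttributes` field are assumptions based on the CUDA 12.8 driver API, and `cuGetProcAddress` is shown in its five-argument CUDA 12 `_v2` form.

```cpp
// Sketch of "resolve the batch-copy symbol at runtime, else loop" (hypothetical).
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>

using PFN_MemcpyBatch = CUresult (*)(CUdeviceptr*, CUdeviceptr*, size_t*,
                                     size_t, CUmemcpyAttributes*, size_t*,
                                     size_t, size_t*, CUstream);

static PFN_MemcpyBatch resolve_batch_memcpy() {
  // Succeeds only if the *loaded driver* exports the symbol, so a binary
  // built against CUDA 12.8 headers still runs on older drivers.
  void* fn = nullptr;
  CUdriverProcAddressQueryResult status;
  if (cuGetProcAddress("cuMemcpyBatchAsync", &fn, 12080,
                       CU_GET_PROC_ADDRESS_DEFAULT, &status) != CUDA_SUCCESS) {
    return nullptr;
  }
  return reinterpret_cast<PFN_MemcpyBatch>(fn);
}

void copy_blocks_sketch(int64_t* dst, int64_t* src, int64_t* sizes, size_t n,
                        cudaStream_t stream) {
  static PFN_MemcpyBatch batch = resolve_batch_memcpy();  // resolved once
  if (batch != nullptr) {
    CUmemcpyAttributes attrs = {};
    attrs.srcAccessOrder = CU_MEMCPY_SRC_ACCESS_ORDER_STREAM;
    size_t attrs_idx = 0, fail_idx = 0;
    // One driver call submits all n copies.
    batch(reinterpret_cast<CUdeviceptr*>(dst),
          reinterpret_cast<CUdeviceptr*>(src),
          reinterpret_cast<size_t*>(sizes), n, &attrs, &attrs_idx, 1,
          &fail_idx, static_cast<CUstream>(stream));
    return;
  }
  // Fallback: one submission per copy; cudaMemcpyDefault infers direction.
  for (size_t i = 0; i < n; i++) {
    cudaMemcpyAsync(reinterpret_cast<void*>(dst[i]),
                    reinterpret_cast<const void*>(src[i]),
                    static_cast<size_t>(sizes[i]), cudaMemcpyDefault, stream);
  }
}
```

Caching the resolved pointer in a function-local `static` keeps the lookup off the per-call path.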
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index 18afe4b7925c..4750dd78838d 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -29,6 +29,8 @@ torch::Tensor get_scheduler_metadata(
     isa = cpu_attention::ISA::NEON;
   } else if (isa_hint == "vxe") {
     isa = cpu_attention::ISA::VXE;
+  } else if (isa_hint == "vsx") {
+    isa = cpu_attention::ISA::VSX;
   } else {
     TORCH_CHECK(false, "Unsupported CPU attention ISA hint: " + isa_hint);
   }
@@ -129,6 +131,8 @@ void cpu_attn_reshape_and_cache(
     return cpu_attention::ISA::NEON;
   } else if (isa == "vxe") {
     return cpu_attention::ISA::VXE;
+  } else if (isa == "vsx") {
+    return cpu_attention::ISA::VSX;
   } else {
     TORCH_CHECK(false, "Invalid ISA type: " + isa);
   }
diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp
index f5b473bd262a..b9987fb26c19 100644
--- a/csrc/cpu/cpu_attn_impl.hpp
+++ b/csrc/cpu/cpu_attn_impl.hpp
@@ -12,7 +12,7 @@
 #include "cpu/utils.hpp"
 
 namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16, NEON, VXE };
+enum class ISA { AMX, VEC, VEC16, NEON, VXE, VSX };
 
 // Mirrors csrc/attention/dtype_fp8.cuh Fp8KVCacheDataType exactly.
 enum class Fp8KVCacheDataType {
@@ -164,6 +164,9 @@ struct AttentionMetadata {
       case ISA::VXE:
         ss << "VXE, ";
         break;
+      case ISA::VSX:
+        ss << "VSX, ";
+        break;
     }
     ss << "workitem_group_num: " << workitem_group_num
        << ", reduction_item_num: " << reduction_item_num
diff --git a/csrc/cpu/cpu_attn_vec.hpp b/csrc/cpu/cpu_attn_vec.hpp
index 61cae12d67da..c3983e0578a5 100644
--- a/csrc/cpu/cpu_attn_vec.hpp
+++ b/csrc/cpu/cpu_attn_vec.hpp
@@ -27,8 +27,8 @@ FORCE_INLINE std::pair<vec_op::FP32Vec16, vec_op::FP32Vec16> load_b_pair_vec(
     return {vec_op::FP32Vec16(bf16_b_reg, 0), vec_op::FP32Vec16(bf16_b_reg, 1)};
   } else {
     using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
-    return {vec_op::FP32Vec16(load_vec_t(ptr)),
-            vec_op::FP32Vec16(load_vec_t(ptr + 16))};
+    return std::make_pair(vec_op::FP32Vec16(load_vec_t(ptr)),
+                          vec_op::FP32Vec16(load_vec_t(ptr + 16)));
   }
 }
diff --git a/csrc/cpu/cpu_attn_vsx.hpp b/csrc/cpu/cpu_attn_vsx.hpp
new file mode 100644
index 000000000000..c7e1502bcb05
--- /dev/null
+++ b/csrc/cpu/cpu_attn_vsx.hpp
@@ -0,0 +1,359 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#ifndef CPU_ATTN_VSX_HPP
+#define CPU_ATTN_VSX_HPP
+
+#include "cpu_attn_impl.hpp"
+#include <altivec.h>
+#include <cstdint>
+
+namespace cpu_attention {
+
+namespace {
+
+// ppc64le Vector = 16 bytes (128 bits)
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, __vector float& b0,
+                                     __vector float& b1);
+
+// [1] Float Specialization
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const float* p, __vector float& b0,
+                                     __vector float& b1) {
+  b0 = vec_xl(0, const_cast<float*>(p));
+  b1 = vec_xl(0, const_cast<float*>(p + 4));
+}
+
+// [2] BFloat16 Specialization (Little Endian ppc64le)
+// On ppc64le (LE): BF16 bits should land in the HIGH 16 bits of each float32.
+// Byte layout of float32 on LE: [byte0(LSB), byte1, byte2, byte3(MSB)]
+// We need BF16 in bytes2-3 (high half) with bytes0-1 zeroed.
+// vec_mergeh on LE interleaves elements 0..3: result_i = {a[i], b[i]}
+// So vec_mergeh(zeros_u16, raw_u16) gives for each uint16 pair:
+//   uint16[2i]   = zeros[i] -> low 16 bits of uint32  -> zeroed mantissa LSBs
+//   uint16[2i+1] = raw[i]   -> high 16 bits of uint32 -> BF16 bits
+// Cast to float32 gives exactly (bf16_bits << 16) per element.
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p,
+                                     __vector float& b0,
+                                     __vector float& b1) {
+  __vector unsigned short raw = vec_xl(
+      0, reinterpret_cast<unsigned short*>(const_cast<c10::BFloat16*>(p)));
+  __vector unsigned short zeros = vec_splat_u16(0);
+
+  // LE: zeros in low 16 bits, raw in high 16 bits → bf16 << 16 == float32
+  b0 = (__vector float)vec_mergeh(zeros, raw);
+  b1 = (__vector float)vec_mergel(zeros, raw);
+}
+
+// Note: c10::Half (FP16) is not supported on PowerPC architecture
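To see why the zero-interleave in the BF16 specialization is exact rather than approximate, here is a scalar model of the same widening (illustrative, not part of the patch; `bf16_bits_to_float` is a hypothetical name):

```cpp
// A bfloat16 is the top 16 bits of an IEEE-754 float32, so placing its bit
// pattern in the high half of a uint32 and zeroing the low half reconstructs
// the (rounded) float exactly.
#include <cstdint>
#include <cstring>
#include <cassert>

float bf16_bits_to_float(uint16_t bf16_bits) {
  uint32_t f32_bits = static_cast<uint32_t>(bf16_bits) << 16;  // high half
  float out;
  std::memcpy(&out, &f32_bits, sizeof(out));  // bit-cast, no conversion
  return out;
}

int main() {
  // 0x3F80 is bfloat16 1.0; shifting left by 16 gives 0x3F800000 == 1.0f.
  assert(bf16_bits_to_float(0x3F80) == 1.0f);
  // On little-endian ppc64le the SIMD version reaches the same byte layout
  // by interleaving a zero uint16 below each BF16 lane with vec_mergeh/l.
  return 0;
}
```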
+
+template <int32_t M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_ppc64le_Mx8_Ku4(
+    const float* __restrict A,       // [M x K]
+    const kv_cache_t* __restrict B,  // [K x 8]
+    float* __restrict C,             // [M x 8]
+    int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+  // 1. Define A pointers
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+  ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+  // 2. Define Accumulators (2 vectors cover 8 columns)
+#define DECL_ACC(i) __vector float acc##i##_0, acc##i##_1;
+  ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+  // 3. Initialize Accumulators (Load C or Zero)
+#define INIT_ACC(i)                                                   \
+  IF_M(i) {                                                           \
+    if (accumulate) {                                                 \
+      acc##i##_0 = vec_xl(0, const_cast<float*>(C + (i) * ldc + 0));  \
+      acc##i##_1 = vec_xl(0, const_cast<float*>(C + (i) * ldc + 4));  \
+    } else {                                                          \
+      acc##i##_0 = vec_splats(0.0f);                                  \
+      acc##i##_1 = vec_splats(0.0f);                                  \
+    }                                                                 \
+  }
+  ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+  int32_t k = 0;
+
+  for (; k + 3 < K; k += 4) {
+    // Load 4 values of A for each Row M: A[k...k+3]
+#define LOAD_A4(i)        \
+  __vector float a##i##v; \
+  IF_M(i) a##i##v = vec_xl(0, const_cast<float*>(a##i + k));
+    ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+    // FMA for specific lane L of A
+    // ppc64le: vec_madd(b, vec_splat(a, lane), acc)
+#define FMAS_LANE(i, aiv, L)                        \
+  IF_M(i) {                                         \
+    __vector float a_broad = vec_splat(aiv, L);     \
+    acc##i##_0 = vec_madd(b0, a_broad, acc##i##_0); \
+    acc##i##_1 = vec_madd(b1, a_broad, acc##i##_1); \
+  }
+
+    // Unroll K=0..3
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+      ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+      ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+      ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+    }
+    {
+      __vector float b0, b1;
+      load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+      ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+    }
+#undef FMAS_LANE
+  }
+
+  for (; k < K; ++k) {
+    __vector float b0, b1;
+    load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i)                              \
+  IF_M(i) {                                      \
+    __vector float ai = vec_splats(*(a##i + k)); \
+    acc##i##_0 = vec_madd(b0, ai, acc##i##_0);   \
+    acc##i##_1 = vec_madd(b1, ai, acc##i##_1);   \
+  }
+    ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+  }
+
+#define STORE_ROW(i)                           \
+  IF_M(i) {                                    \
+    vec_xst(acc##i##_0, 0, C + (i) * ldc + 0); \
+    vec_xst(acc##i##_1, 0, C + (i) * ldc + 4); \
+  }
+  ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
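The lane-splat micro-kernel is easier to audit against a plain reference: each `vec_splat(a, lane)` broadcast followed by two `vec_madd`s corresponds to one iteration of the inner `n` loop below, and the K-by-4 unrolling only reorders those accumulations. A hypothetical scalar equivalent (`gemm_micro_ref`, shown only to make the data flow explicit, with B pre-widened to float):

```cpp
// Reference model of the Mx8 micro-kernel: C[M x 8] (+)= A[M x K] * B[K x 8].
#include <cstdint>

template <int M>
void gemm_micro_ref(const float* A, const float* B, float* C, int64_t lda,
                    int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
  static_assert(1 <= M && M <= 8, "M must be in [1,8]");
  for (int m = 0; m < M; ++m) {
    float acc[8] = {};  // mirrors acc##m##_0 / acc##m##_1
    if (accumulate)
      for (int n = 0; n < 8; ++n) acc[n] = C[m * ldc + n];
    for (int32_t k = 0; k < K; ++k) {
      const float a = A[m * lda + k];  // the broadcast lane
      for (int n = 0; n < 8; ++n)      // two 4-wide VSX FMAs in the kernel
        acc[n] += a * B[k * ldb + n];
    }
    for (int n = 0; n < 8; ++n) C[m * ldc + n] = acc[n];
  }
}
```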
+
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_ppc64le_Mx8_Ku4(const float* __restrict A,
+                                             const kv_cache_t* __restrict B,
+                                             float* __restrict C, int32_t M,
+                                             int32_t K, int64_t lda,
+                                             int64_t ldb, int64_t ldc,
+                                             bool accumulate) {
+  static_assert(N % 8 == 0, "N must be a multiple of 8");
+  for (int32_t m = 0; m < M;) {
+    int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
+    const float* Ab = A + m * lda;
+    float* Cb = C + m * ldc;
+
+    for (int32_t n = 0; n < N; n += 8) {
+      const kv_cache_t* Bn = B + n;
+      float* Cn = Cb + n;
+      switch (mb) {
+        case 8:
+          gemm_micro_ppc64le_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        case 4:
+          gemm_micro_ppc64le_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        case 2:
+          gemm_micro_ppc64le_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+        default:
+          gemm_micro_ppc64le_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+                                                    K, accumulate);
+          break;
+      }
+    }
+    m += mb;
+  }
+}
+
+template <typename kv_cache_t>
+class TileGemmPPC64 {
+ public:
+  template <AttentionGemmPhase phase, int32_t k_size>
+  FORCE_INLINE static void gemm(const int32_t m_size,
+                                float* __restrict__ a_tile,
+                                kv_cache_t* __restrict__ b_tile,
+                                float* __restrict__ c_tile, const int64_t lda,
+                                const int64_t ldb, const int64_t ldc,
+                                const int32_t block_size,
+                                const int32_t dynamic_k_size,
+                                const bool accum_c) {
+    if constexpr (phase == AttentionGemmPhase::QK) {
+      gemm_macro_ppc64le_Mx8_Ku4(
+          a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+    } else {
+      gemm_macro_ppc64le_Mx8_Ku4(
+          a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+          accum_c);
+    }
+  }
+};
+
+}  // namespace
+
+template <typename scalar_t, int64_t head_dim>
+class AttentionImpl {
+ public:
+  using query_t = scalar_t;
+  using q_buffer_t = float;
+  using kv_cache_t = scalar_t;
+  using logits_buffer_t = float;
+  using partial_output_buffer_t = float;
+  using prob_buffer_t = float;
+
+  constexpr static int64_t BlockSizeAlignment = BLOCK_SIZE_ALIGNMENT;
+  constexpr static int64_t HeadDimAlignment = HEAD_SIZE_ALIGNMENT;
+  constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+  constexpr static int64_t HeadDim = head_dim;
+  constexpr static ISA ISAType = ISA::VSX;
+  constexpr static bool scale_on_logits =
+      false;  // Scale is applied to Q during copy
+
+ public:
+  AttentionImpl() {}
+
+  template
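To make the row blocking in `gemm_macro_ppc64le_Mx8_Ku4` concrete: the loop greedily takes 8 rows while it can, then 4, 2, and finally 1, so every value of M is covered by the four instantiated micro-kernels. A small standalone demonstration (hypothetical `main`, not part of the patch):

```cpp
// Greedy 8/4/2/1 row-block selection, as used by the macro kernel above.
#include <cstdio>

int main() {
  int M = 13;
  for (int m = 0; m < M;) {
    int mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
    std::printf("rows [%d, %d)\n", m, m + mb);  // blocks of 8, 4, 1 for M = 13
    m += mb;
  }
  return 0;
}
```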