diff --git a/.gitignore b/.gitignore
index fb5fbd86..04a9a266 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,6 +47,7 @@ configs/etcdctl
 configs/*.whl
 configs/*.deb
 configs/*.tar.gz
+configs/wheels/
 .ruff_cache/
 *.egg-info/
diff --git a/configs/install-ai-dynamo.sh b/configs/install-ai-dynamo.sh
new file mode 100755
index 00000000..a005d5eb
--- /dev/null
+++ b/configs/install-ai-dynamo.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+DYNAMO_VERSION="${DYNAMO_VERSION:-}"
+
+if [ -z "${DYNAMO_VERSION}" ]; then
+  echo "ERROR: DYNAMO_VERSION must be set for ai-dynamo wheel install" >&2
+  exit 1
+fi
+
+DYNAMO_PACKAGE="ai-dynamo==${DYNAMO_VERSION}"
+DYNAMO_RUNTIME_PACKAGE="ai-dynamo-runtime==${DYNAMO_VERSION}"
+DYNAMO_WHEEL_NAME="${DYNAMO_WHEEL_NAME:-ai_dynamo-${DYNAMO_VERSION}-py3-none-any.whl}"
+DYNAMO_RUNTIME_WHEEL_PATTERN="${DYNAMO_RUNTIME_WHEEL_PATTERN:-ai_dynamo_runtime-${DYNAMO_VERSION}-*.whl}"
+DYNAMO_WHEEL_DIRS="${DYNAMO_WHEEL_DIRS:-/configs/wheels /configs}"
+PYTHON_BIN="${PYTHON_BIN:-}"
+
+if [ -z "${PYTHON_BIN}" ]; then
+  if command -v python3 >/dev/null 2>&1; then
+    PYTHON_BIN="python3"
+  elif command -v python >/dev/null 2>&1; then
+    PYTHON_BIN="python"
+  else
+    echo "ERROR: neither python3 nor python found in PATH" >&2
+    exit 127
+  fi
+fi
+
+# Skip the install if ai-dynamo is already importable in this environment.
+if "${PYTHON_BIN}" - <<'PY' >/dev/null 2>&1
+import dynamo.llm  # noqa: F401
+PY
+then
+  echo "ai-dynamo already importable; skipping wheel install"
+  exit 0
+fi
+
+# Locate the staged wheels, taking the first directory that provides each.
+wheel_path=""
+runtime_wheel_path=""
+for wheel_dir in ${DYNAMO_WHEEL_DIRS}; do
+  if [ -z "${wheel_path}" ] && [ -f "${wheel_dir}/${DYNAMO_WHEEL_NAME}" ]; then
+    wheel_path="${wheel_dir}/${DYNAMO_WHEEL_NAME}"
+  fi
+  if [ -z "${runtime_wheel_path}" ]; then
+    runtime_wheel_path="$(find "${wheel_dir}" -maxdepth 1 -type f -name "${DYNAMO_RUNTIME_WHEEL_PATTERN}" -print -quit 2>/dev/null)"
+  fi
+done
+
+if [ -n "${wheel_path}" ] && [ -n "${runtime_wheel_path}" ]; then
+  echo "Installing ${DYNAMO_RUNTIME_PACKAGE} and ${DYNAMO_PACKAGE} from staged wheels"
+  "${PYTHON_BIN}" -m pip install "${runtime_wheel_path}" "${wheel_path}"
+else
+  echo "ERROR: ai-dynamo wheels not found in ${DYNAMO_WHEEL_DIRS}" >&2
+  echo "ERROR: expected ${DYNAMO_WHEEL_NAME} and ${DYNAMO_RUNTIME_WHEEL_PATTERN}" >&2
+  exit 1
+fi
+
+"${PYTHON_BIN}" - <<'PY'
+import dynamo.llm  # noqa: F401
+PY
diff --git a/configs/patches/vllm_cumem_expandable_segments_fix.py b/configs/patches/vllm_cumem_expandable_segments_fix.py
new file mode 100644
index 00000000..04474195
--- /dev/null
+++ b/configs/patches/vllm_cumem_expandable_segments_fix.py
@@ -0,0 +1,169 @@
+"""
+Patch vLLM's CuMemAllocator to be compatible with PyTorch expandable
+segments by temporarily toggling the allocator setting around the memory
+pool context (sleep mode), instead of hard-asserting at __init__ time.
+
+Backports vllm-project/vllm#40812 ("Auto-disable expandable_segments
+around cumem memory pool"). Without this patch, setting
+PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True together with
+enable-sleep-mode causes vLLM to abort during CuMemAllocator
+construction; with this patch, expandable segments stay on for normal
+allocations and are flipped off only for the duration of
+use_memory_pool().
+
+Reference: https://github.com/vllm-project/vllm/pull/40812
+Affected file: vllm/device_allocator/cumem.py
+"""
+
+import sys
+from pathlib import Path
+
+TARGET = Path("/usr/local/lib/python3.12/dist-packages/vllm/device_allocator/cumem.py")
+
+# Idempotency: the new use_memory_pool body introduces this exact line.
+MARKER = 'expandable_was_enabled = "expandable_segments:True" in conf'
+
+# --- Hunk 1: drop the __init__ assertion -------------------------------------
+
+INIT_OLD = (
+    "    def __init__(self):\n"
+    '        conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n'
+    '        assert "expandable_segments:True" not in conf, (\n'
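+    # NOTE: the OLD blocks must match the installed cumem.py byte-for-byte,
+    # indentation included; main() refuses to patch when an anchor matches
+    # zero times or more than once, so upstream drift fails loudly instead
+    # of silently mis-patching.
+    '            "Expandable segments are not compatible with memory pool. 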
"\n' + ' "Please track https://github.com/pytorch/pytorch/issues/147851 "\n' + ' "for the latest updates."\n' + " )\n" + "\n" + " self.pointer_to_data: dict[int, AllocationData] = {}\n" +) + +INIT_NEW = ( + " def __init__(self):\n" + " self.pointer_to_data: dict[int, AllocationData] = {}\n" +) + +# --- Hunk 2: wrap use_memory_pool body in try/finally + toggle --------------- + +POOL_OLD = ( + " assert isinstance(tag, str)\n" + "\n" + " old_tag = self.current_tag\n" + " self.current_tag = tag\n" + " with use_memory_pool_with_allocator(\n" + " self.python_malloc_callback, self.python_free_callback\n" + " ) as data:\n" + " # start to hit another PyTorch bug in PyTorch 2.6,\n" + " # possibly because of gc-related issue w.r.t. the allocator and\n" + " # the memory pool.\n" + " # to avoid the issue, we keep a reference of the data.\n" + " # see https://github.com/pytorch/pytorch/issues/146431 .\n" + " self.allocator_and_pools[tag] = data\n" + " yield\n" + " # PyTorch's bug, calling torch.cuda.empty_cache() will error\n" + " # when using pluggable allocator, see\n" + " # https://github.com/pytorch/pytorch/issues/145168 .\n" + " # if we have some memory allocated and then freed,\n" + " # the memory will not be released, e.g. in online quantization,\n" + " # where the model is created in higher precision, and then\n" + " # quantized in lower precision.\n" + " # Find all unused allocations and manually release them.\n" + " # TODO: we should expose `empty_cache` method in the memory pool.\n" + " # TODO: ask for help from PyTorch team to expose this method.\n" + " allocations = data[0].snapshot()\n" + " for allocation in allocations:\n" + " if allocation[\"allocated_size\"] == 0:\n" + " handle = self._python_free_callback(allocation[\"address\"])\n" + " unmap_and_release(handle)\n" + " self.current_tag = old_tag\n" +) + +POOL_NEW = ( + " assert isinstance(tag, str)\n" + "\n" + " # Expandable segments are incompatible with the memory pool used for\n" + " # sleep mode (see https://github.com/pytorch/pytorch/issues/147851).\n" + " # If the user has enabled expandable segments via\n" + " # PYTORCH_CUDA_ALLOC_CONF, temporarily disable them for the duration\n" + " # of the memory pool context and restore on exit.\n" + ' conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")\n' + ' expandable_was_enabled = "expandable_segments:True" in conf\n' + " if expandable_was_enabled:\n" + ' torch.cuda.memory._set_allocator_settings("expandable_segments:False")\n' + "\n" + " old_tag = self.current_tag\n" + " self.current_tag = tag\n" + " try:\n" + " with use_memory_pool_with_allocator(\n" + " self.python_malloc_callback, self.python_free_callback\n" + " ) as data:\n" + " # start to hit another PyTorch bug in PyTorch 2.6,\n" + " # possibly because of gc-related issue w.r.t. the allocator\n" + " # and the memory pool.\n" + " # to avoid the issue, we keep a reference of the data.\n" + " # see https://github.com/pytorch/pytorch/issues/146431 .\n" + " self.allocator_and_pools[tag] = data\n" + " yield\n" + " # PyTorch's bug, calling torch.cuda.empty_cache() will error\n" + " # when using pluggable allocator, see\n" + " # https://github.com/pytorch/pytorch/issues/145168 .\n" + " # if we have some memory allocated and then freed,\n" + " # the memory will not be released, e.g. 
in online\n" + " # quantization, where the model is created in higher\n" + " # precision, and then quantized in lower precision.\n" + " # Find all unused allocations and manually release them.\n" + " # TODO: we should expose `empty_cache` method in the memory\n" + " # pool.\n" + " # TODO: ask for help from PyTorch team to expose this method.\n" + " allocations = data[0].snapshot()\n" + " for allocation in allocations:\n" + " if allocation[\"allocated_size\"] == 0:\n" + " handle = self._python_free_callback(allocation[\"address\"])\n" + " unmap_and_release(handle)\n" + " finally:\n" + " self.current_tag = old_tag\n" + " if expandable_was_enabled:\n" + ' torch.cuda.memory._set_allocator_settings("expandable_segments:True")\n' +) + +PATCHES = [ + ("CuMemAllocator.__init__ assertion removal", INIT_OLD, INIT_NEW), + ("CuMemAllocator.use_memory_pool toggle", POOL_OLD, POOL_NEW), +] + + +def main(): + if not TARGET.exists(): + print(f"[vllm-cumem-expandable-fix] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + if MARKER in content: + print("[vllm-cumem-expandable-fix] Already patched, skipping.", file=sys.stderr) + return + + new_content = content + for name, old, new in PATCHES: + count = new_content.count(old) + if count == 0: + print( + f"[vllm-cumem-expandable-fix] Anchor for {name!r} not found. " + "vLLM version may have drifted; inspect cumem.py.", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-cumem-expandable-fix] Anchor for {name!r} is ambiguous " + f"({count} matches); refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + new_content = new_content.replace(old, new, 1) + print(f"[vllm-cumem-expandable-fix] Patched {name}", file=sys.stderr) + + TARGET.write_text(new_content) + print("[vllm-cumem-expandable-fix] Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py b/configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py new file mode 100644 index 00000000..44cbbb40 --- /dev/null +++ b/configs/patches/vllm_deepseek_v4_mega_moe_free_orig.py @@ -0,0 +1,111 @@ +""" +Free original DeepSeek V4 MoE expert weights after MegaMoE finalize. + +Symptom (seen on GB200 decode, EP=8, VLLM_DEEPSEEK_V4_USE_MEGA_MOE=1): + torch.OutOfMemoryError: Tried to allocate 1008.00 MiB. + GPU 0 has a total capacity of 184.31 GiB of which 381.44 MiB is free. + 181.02 GiB allocated by PyTorch. + Stack ends in deep_gemm/mega/__init__.py interleave(): + torch.empty_like(t).copy_(torch.stack([gate, up], dim=2).reshape(...)) + +Root cause: DeepseekV4MegaMoEExperts.finalize_weights() builds +self._transformed_l1_weights / _transformed_l2_weights but does NOT release +the original self.w13_weight / w2_weight / *_weight_scale parameters. Both +copies stay resident on GPU through finalize iteration, and on EP=8 the +per-rank weight footprint (~125 GiB) plus this duplication leaves no +headroom for the per-layer interleave temporaries (~1 GiB peak). + +Forward path verified (deepseek_v4.py: _run_mega_moe, ~line 538-547) only +reads self._transformed_l1_weights / _transformed_l2_weights. Original +w13_weight / w2_weight / *_weight_scale are dead after finalize. + +Fix (mirrors upstream PR vllm-project/vllm#40860): at the end of +finalize_weights() of each expert module, drop the four original +Parameters by assigning them to None so they are removed from the module's +_parameters dict. 
transform_weights_for_mega_moe allocates fresh L1 + SF +tensors and only the L2 weight aliases the original w2_weight storage -- +_transformed_l2_weights still holds that reference, so the storage stays +live via refcount. PyTorch's caching allocator can then reuse the freed +storage for the NEXT layer's interleave temporaries within the same +finalize loop. + +Reference: vllm/model_executor/models/deepseek_v4.py, +DeepseekV4MegaMoEExperts.finalize_weights(). +""" + +import sys +from pathlib import Path + +TARGET = Path( + "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v4.py" +) + +# Idempotency marker +MARKER = "srt-slurm-sa hotfix: free original MegaMoE expert weights" + +# Anchor: closing of the _transformed_l1/l2 assignment in finalize_weights(). +# The triple-`)` pattern is unique in the file. +OLD = ( + " self._transformed_l1_weights, self._transformed_l2_weights = (\n" + " deep_gemm.transform_weights_for_mega_moe(\n" + " (self.w13_weight.data.view(torch.int8).contiguous(), w13_scale),\n" + " (self.w2_weight.data.view(torch.int8).contiguous(), w2_scale),\n" + " )\n" + " )\n" +) + +NEW = ( + OLD + + " # srt-slurm-sa hotfix: free original MegaMoE expert weights.\n" + + " # Mirrors vllm-project/vllm#40860. transform_weights_for_mega_moe\n" + + " # allocates fresh L1 + SF tensors; only the L2 weight aliases the\n" + + " # original w2_weight storage, but _transformed_l2_weights holds that\n" + + " # reference, so dropping the Parameters is safe via refcount and the\n" + + " # freed storage returns to the caching allocator in time for the next\n" + + " # layer's interleave temp (~1 GiB).\n" + + " self.w13_weight = None\n" + + " self.w13_weight_scale = None\n" + + " self.w2_weight = None\n" + + " self.w2_weight_scale = None\n" +) + + +def main(): + if not TARGET.exists(): + print(f"[vllm-mega-moe-free-orig] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + + if MARKER in content: + print("[vllm-mega-moe-free-orig] Already patched, skipping.", file=sys.stderr) + return + + count = content.count(OLD) + if count == 0: + print( + "[vllm-mega-moe-free-orig] Could not find finalize_weights anchor. " + "vLLM version may have drifted; inspect " + "DeepseekV4MegaMoEExperts.finalize_weights().", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-mega-moe-free-orig] Anchor is ambiguous ({count} occurrences); " + "refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + + content = content.replace(OLD, NEW) + TARGET.write_text(content) + print( + "[vllm-mega-moe-free-orig] Freed original w13/w2 weights and scales " + "in DeepseekV4MegaMoEExperts.finalize_weights().", + file=sys.stderr, + ) + + +if __name__ == "__main__": + main() diff --git a/configs/patches/vllm_numa_bind_hash_fix.py b/configs/patches/vllm_numa_bind_hash_fix.py new file mode 100644 index 00000000..0759238c --- /dev/null +++ b/configs/patches/vllm_numa_bind_hash_fix.py @@ -0,0 +1,84 @@ +""" +Patch vLLM's ParallelConfig.compute_hash to exclude NUMA-bind fields +(numa_bind / numa_bind_nodes / numa_bind_cpus) from the DP consistency hash. + +Symptom (seen on GB300, 1 worker, DP=4, numa-bind=True): + RuntimeError: Configuration mismatch detected for engine 3. + All DP workers must have identical configurations for parameters that + affect collective communication ... + +Root cause: when numa-bind is enabled, each DP rank auto-detects and stores +its own per-rank NUMA node in ParallelConfig.numa_bind_nodes. 
These per-rank +values enter compute_hash(), so ranks on different NUMA nodes produce +different hashes and fail the DP startup check. NUMA binding affects only +host-side memory locality, not collective-communication semantics, so it is +safe to exclude from the DP hash. + +Reference: vllm/config/parallel.py, ParallelConfig.compute_hash(), +ignored_factors set. +""" + +import sys +from pathlib import Path + +TARGET = Path( + "/usr/local/lib/python3.12/dist-packages/vllm/config/parallel.py" +) + +# Idempotency: if any of our additions is already present, skip. +MARKER = '"numa_bind",' + +# Anchor: the last entry of the existing ignored_factors set in the +# upstream compute_hash method. We insert the three numa fields just +# before the closing brace. +OLD = ' "_api_process_rank",\n }' + +NEW = ( + ' "_api_process_rank",\n' + ' # srt-slurm-sa hotfix: numa-bind fields are per-rank runtime\n' + ' # topology, not collective-communication semantics.\n' + ' "numa_bind",\n' + ' "numa_bind_nodes",\n' + ' "numa_bind_cpus",\n' + ' }' +) + + +def main(): + if not TARGET.exists(): + print(f"[vllm-numa-bind-hash-fix] Target not found: {TARGET}", file=sys.stderr) + sys.exit(1) + + content = TARGET.read_text() + + if MARKER in content: + print("[vllm-numa-bind-hash-fix] Already patched, skipping.", file=sys.stderr) + return + + count = content.count(OLD) + if count == 0: + print( + "[vllm-numa-bind-hash-fix] Could not find ignored_factors anchor. " + "vLLM version may have drifted; inspect ParallelConfig.compute_hash().", + file=sys.stderr, + ) + sys.exit(1) + if count > 1: + print( + f"[vllm-numa-bind-hash-fix] Anchor is ambiguous ({count} occurrences); " + "refusing to patch.", + file=sys.stderr, + ) + sys.exit(1) + + content = content.replace(OLD, NEW) + TARGET.write_text(content) + print( + "[vllm-numa-bind-hash-fix] Added numa_bind/numa_bind_nodes/numa_bind_cpus " + "to ParallelConfig.compute_hash ignored_factors.", + file=sys.stderr, + ) + + +if __name__ == "__main__": + main() diff --git a/configs/patches/vllm_nvlink_one_sided_bf16_fix.py b/configs/patches/vllm_nvlink_one_sided_bf16_fix.py new file mode 100644 index 00000000..7f19ff8e --- /dev/null +++ b/configs/patches/vllm_nvlink_one_sided_bf16_fix.py @@ -0,0 +1,345 @@ +""" +Patch vLLM to backport Inferact/vllm-svf#180 — bf16 activation support for +the FlashInfer NVLink one-sided MoE all-to-all path. + +Without the patch, FlashInferNVLinkOneSidedPrepareAndFinalize hard-codes the +dispatch payload to nvfp4 (0.5 B/elem hidden + per-16-elem fp8 scales). That +crashes for experts that prefer to receive bf16 tokens and quantize +post-dispatch (e.g. trtllm_mxfp4_moe with mxfp8 activations) and for any +non-nvfp4 quant_dtype. 
+ +Affected files (from PR diff): + - vllm/distributed/device_communicators/all2all.py + - vllm/model_executor/layers/fused_moe/all2all_utils.py + - vllm/model_executor/layers/fused_moe/oracle/mxfp4.py + - vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py + +Reference: https://github.com/Inferact/vllm-svf/pull/180 +""" + +import sys +from pathlib import Path + +VLLM_ROOT = Path("/usr/local/lib/python3.12/dist-packages/vllm") + +# --- File 1: distributed/device_communicators/all2all.py ---------------------- + +ALL2ALL_TARGET = VLLM_ROOT / "distributed/device_communicators/all2all.py" + +ALL2ALL_OLD = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace."""\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_NEW = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace.\n' + "\n" + " dispatch_dtype_bytes_per_elem: bytes/elem for the dispatched hidden\n" + " states. Use 0 as a sentinel for sub-byte nvfp4 (0.5 B/elem); use\n" + " 1 for fp8, 2 for bf16/fp16.\n" + " dispatch_has_fp8_scale: whether a per-16-elem fp8 scale tensor is\n" + " dispatched alongside the hidden states (true for nvfp4/fp8,\n" + " false for bf16 passthrough).\n" + ' """\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_OLD_PAYLOAD = ( + " total_dispatch_payload_size_per_token = (\n" + " hidden_size // 2 # nvfp4 hidden states\n" + " + hidden_size // 16 # fp8 scaling factors\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +ALL2ALL_NEW_PAYLOAD = ( + " if dispatch_dtype_bytes_per_elem == 0:\n" + " hidden_bytes = hidden_size // 2 # nvfp4\n" + " else:\n" + " hidden_bytes = hidden_size * dispatch_dtype_bytes_per_elem\n" + " scale_bytes = hidden_size // 16 if dispatch_has_fp8_scale else 0\n" + " total_dispatch_payload_size_per_token = (\n" + " hidden_bytes\n" + " + scale_bytes\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +# --- File 2: fused_moe/all2all_utils.py --------------------------------------- + +ALL2ALL_UTILS_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/all2all_utils.py" + +ALL2ALL_UTILS_OLD_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +ALL2ALL_UTILS_NEW_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + " defer_input_quant: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +ALL2ALL_UTILS_OLD_BUILD = ( + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " )\n" +) + +ALL2ALL_UTILS_NEW_BUILD = ( + " if defer_input_quant or quant_config.quant_dtype is None:\n" + " # Experts (e.g. 
trtllm_mxfp4 with mxfp8 activations) quantize\n" + " # post-dispatch; ship bf16 tokens with no per-token scale payload.\n" + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 2, False\n" + ' elif quant_config.quant_dtype == "nvfp4":\n' + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 0, True\n" + " else:\n" + " raise NotImplementedError(\n" + ' "flashinfer_nvlink_one_sided dispatch only supports nvfp4, "\n' + ' "bf16, and defer_input_quant paths today; got "\n' + ' f"quant_dtype={quant_config.quant_dtype!r}"\n' + " )\n" + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +# --- File 3: fused_moe/oracle/mxfp4.py ---------------------------------------- + +MXFP4_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/oracle/mxfp4.py" + +MXFP4_OLD = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_NEW = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Some experts (trtllm_mxfp4 with mxfp8 activations) prefer bf16 tokens\n" + " # on dispatch and quantize internally; signal this to the prepare/finalize\n" + " # so workspace + prepare path ship bf16 instead of the quant_config dtype.\n" + " from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import (\n" + " TrtLlmMxfp4ExpertsBase,\n" + " )\n" + "\n" + " defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_OLD_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +MXFP4_NEW_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " defer_input_quant=defer_input_quant,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +# --- File 4: fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py -------- + +PREP_TARGET = VLLM_ROOT / ( + "model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py" +) + +PREP_OLD_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_NEW_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_OLD_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " )\n" +) + +PREP_NEW_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +PREP_OLD_QUANT = ( + " a1q, a1q_scale = 
moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +PREP_NEW_QUANT = ( + " if defer_input_quant:\n" + " # Experts (e.g. trtllm_mxfp4_moe with mxfp8 activations) will\n" + " # quantize post-dispatch. Ship bf16 tokens and skip scales.\n" + " a1q, a1q_scale = a1, None\n" + " else:\n" + " a1q, a1q_scale = moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +# (target file, marker indicating already-patched, [(name, old, new), ...]) +FILES = [ + ( + ALL2ALL_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("MoeAlltoAll.initialize signature", ALL2ALL_OLD, ALL2ALL_NEW), + ("MoeAlltoAll dispatch payload sizing", ALL2ALL_OLD_PAYLOAD, ALL2ALL_NEW_PAYLOAD), + ], + ), + ( + ALL2ALL_UTILS_TARGET, + # Note: bare "defer_input_quant" appears in a comment in the base + # file ("# Unquantized dispatch (e.g. AITER with defer_input_quant):"), + # so we anchor on a string we *introduce* — namely the parameter + # declaration in maybe_make_prepare_finalize's signature. + "defer_input_quant: bool = False,", + [ + ("maybe_make_prepare_finalize signature", ALL2ALL_UTILS_OLD_SIG, ALL2ALL_UTILS_NEW_SIG), + ( + "FlashInferNVLinkOneSided builder", + ALL2ALL_UTILS_OLD_BUILD, + ALL2ALL_UTILS_NEW_BUILD, + ), + ], + ), + ( + MXFP4_TARGET, + # Note: bare "TrtLlmMxfp4ExpertsBase" appears in a comment in the + # base file. Anchor on the assignment we introduce instead. + "defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)", + [ + ("make_mxfp4_moe_kernel defer_input_quant detection", MXFP4_OLD, MXFP4_NEW), + ("make_mxfp4_moe_kernel pass-through", MXFP4_OLD_CALL, MXFP4_NEW_CALL), + ], + ), + ( + PREP_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("FlashInferNVLinkOneSided __init__ signature", PREP_OLD_INIT, PREP_NEW_INIT), + ("FlashInferNVLinkOneSided initialize call", PREP_OLD_CALL, PREP_NEW_CALL), + ("FlashInferNVLinkOneSided prepare quant branch", PREP_OLD_QUANT, PREP_NEW_QUANT), + ], + ), +] + + +def patch_file(target: Path, marker: str, patches: list[tuple[str, str, str]]) -> bool: + if not target.exists(): + print(f"[nvlink-bf16-patch] Target not found: {target}", file=sys.stderr) + return False + + content = target.read_text() + if marker in content: + print(f"[nvlink-bf16-patch] {target.name}: already patched, skipping.", file=sys.stderr) + return True + + new_content = content + for name, old, new in patches: + count = new_content.count(old) + if count == 0: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} not found. 
" + "vLLM version may have drifted.", + file=sys.stderr, + ) + return False + if count > 1: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} is ambiguous " + f"({count} matches); refusing to patch.", + file=sys.stderr, + ) + return False + new_content = new_content.replace(old, new, 1) + print(f"[nvlink-bf16-patch] {target.name}: patched {name}", file=sys.stderr) + + target.write_text(new_content) + return True + + +def main(): + failures = 0 + for target, marker, patches in FILES: + if not patch_file(target, marker, patches): + failures += 1 + + if failures: + print(f"[nvlink-bf16-patch] {failures} file(s) failed to patch", file=sys.stderr) + sys.exit(1) + print("[nvlink-bf16-patch] Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py b/configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py new file mode 100644 index 00000000..ce63d88a --- /dev/null +++ b/configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py @@ -0,0 +1,382 @@ +""" +Patch vLLM v0.20.0 to backport vllm-project/vllm#40960 — bf16 activation +support for the FlashInfer NVLink one-sided MoE all-to-all path. + +Without the patch, FlashInferNVLinkOneSidedPrepareAndFinalize hard-codes the +dispatch payload to nvfp4 (0.5 B/elem hidden + per-16-elem fp8 scales). That +crashes for experts that prefer to receive bf16 tokens and quantize +post-dispatch (e.g. trtllm_mxfp4_moe with mxfp8 activations) and for any +non-nvfp4 quant_dtype. + +Affected files (from PR diff): + - vllm/distributed/device_communicators/all2all.py + - vllm/model_executor/layers/fused_moe/all2all_utils.py + - vllm/model_executor/layers/fused_moe/oracle/mxfp4.py + - vllm/model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py + +Reference: https://github.com/vllm-project/vllm/pull/40960 +Target: vLLM v0.20.0 + +Additional local change (NOT in PR 40960): remove the early guard in +maybe_make_prepare_finalize that raises ValueError for any non-nvfp4 +quant_dtype on the flashinfer_nvlink_one_sided backend. The PR 40960 diff +does not touch this guard, so without removing it the new bf16/defer-quant +path is unreachable on v0.20.0. Reviewers on the PR flagged the same gap. +""" + +import sys +from pathlib import Path + +VLLM_ROOT = Path("/usr/local/lib/python3.12/dist-packages/vllm") + +# --- File 1: distributed/device_communicators/all2all.py ---------------------- + +ALL2ALL_TARGET = VLLM_ROOT / "distributed/device_communicators/all2all.py" + +ALL2ALL_OLD = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace."""\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_NEW = ( + " top_k: int,\n" + " num_experts: int,\n" + " hidden_size: int,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + ' """Initialize the MoeAlltoAll workspace.\n' + "\n" + " dispatch_dtype_bytes_per_elem: bytes/elem for the dispatched hidden\n" + " states. 
Use 0 as a sentinel for sub-byte nvfp4 (0.5 B/elem); use\n" + " 1 for fp8, 2 for bf16/fp16.\n" + " dispatch_has_fp8_scale: whether a per-16-elem fp8 scale tensor is\n" + " dispatched alongside the hidden states (true for nvfp4/fp8,\n" + " false for bf16 passthrough).\n" + ' """\n' + " if self.initialized:\n" + " return\n" +) + +ALL2ALL_OLD_PAYLOAD = ( + " total_dispatch_payload_size_per_token = (\n" + " hidden_size // 2 # nvfp4 hidden states\n" + " + hidden_size // 16 # fp8 scaling factors\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +ALL2ALL_NEW_PAYLOAD = ( + " if dispatch_dtype_bytes_per_elem == 0:\n" + " hidden_bytes = hidden_size // 2 # nvfp4\n" + " else:\n" + " hidden_bytes = hidden_size * dispatch_dtype_bytes_per_elem\n" + " scale_bytes = hidden_size // 16 if dispatch_has_fp8_scale else 0\n" + " total_dispatch_payload_size_per_token = (\n" + " hidden_bytes\n" + " + scale_bytes\n" + " + top_k * 4 # int32 topks ids\n" + " + top_k * 4 # float32 topk weights\n" + " )\n" +) + +# --- File 2: fused_moe/all2all_utils.py --------------------------------------- + +ALL2ALL_UTILS_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/all2all_utils.py" + +ALL2ALL_UTILS_OLD_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +ALL2ALL_UTILS_NEW_SIG = ( + " routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,\n" + " allow_new_interface: bool = False,\n" + " use_monolithic: bool = False,\n" + " defer_input_quant: bool = False,\n" + ") -> FusedMoEPrepareAndFinalize | None:\n" +) + +# Local change: drop the nvfp4-only guard so the bf16/defer_input_quant path +# in the patched builder below is reachable. Not part of PR 40960. +ALL2ALL_UTILS_OLD_GUARD = ( + " elif moe.use_fi_nvl_one_sided_kernels:\n" + " assert quant_config is not None\n" + ' if quant_config.quant_dtype != "nvfp4":\n' + " raise ValueError(\n" + " \"The 'flashinfer_nvlink_one_sided' all2all backend only \"\n" + ' "supports nvfp4 activation quantization, but got "\n' + ' f"quant_dtype={quant_config.quant_dtype!r}. Use a different "\n' + " \"all2all backend (e.g. 'flashinfer_nvlink_two_sided' or \"\n" + " \"'allgather_reducescatter') for non-nvfp4 models.\"\n" + " )\n" + " max_num_tokens = (\n" + " get_current_vllm_config().scheduler_config.max_num_batched_tokens\n" + " )\n" +) + +ALL2ALL_UTILS_NEW_GUARD = ( + " elif moe.use_fi_nvl_one_sided_kernels:\n" + " assert quant_config is not None\n" + " max_num_tokens = (\n" + " get_current_vllm_config().scheduler_config.max_num_batched_tokens\n" + " )\n" +) + +ALL2ALL_UTILS_OLD_BUILD = ( + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " )\n" +) + +ALL2ALL_UTILS_NEW_BUILD = ( + " if defer_input_quant or quant_config.quant_dtype is None:\n" + " # Experts (e.g. 
trtllm_mxfp4 with mxfp8 activations) quantize\n" + " # post-dispatch; ship bf16 tokens with no per-token scale payload.\n" + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 2, False\n" + ' elif quant_config.quant_dtype == "nvfp4":\n' + " dispatch_dtype_bytes_per_elem, dispatch_has_fp8_scale = 0, True\n" + " else:\n" + " raise NotImplementedError(\n" + ' "flashinfer_nvlink_one_sided dispatch only supports nvfp4, "\n' + ' "bf16, and defer_input_quant paths today; got "\n' + ' f"quant_dtype={quant_config.quant_dtype!r}"\n' + " )\n" + " prepare_finalize = FlashInferNVLinkOneSidedPrepareAndFinalize(\n" + " max_num_tokens=max_num_tokens,\n" + " top_k=moe.experts_per_token,\n" + " num_experts=moe.num_experts,\n" + " hidden_size=moe.hidden_dim,\n" + " num_dispatchers=all2all_manager.world_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +# --- File 3: fused_moe/oracle/mxfp4.py ---------------------------------------- + +MXFP4_TARGET = VLLM_ROOT / "model_executor/layers/fused_moe/oracle/mxfp4.py" + +MXFP4_OLD = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_NEW = ( + ' """Create a FusedMoEKernel for the given MXFP4 backend."""\n' + " is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic)\n" + "\n" + " # Some experts (trtllm_mxfp4 with mxfp8 activations) prefer bf16 tokens\n" + " # on dispatch and quantize internally; signal this to the prepare/finalize\n" + " # so workspace + prepare path ship bf16 instead of the quant_config dtype.\n" + " from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import (\n" + " TrtLlmMxfp4ExpertsBase,\n" + " )\n" + "\n" + " defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)\n" + "\n" + " # Create Prepare/Finalize.\n" + " prepare_finalize = maybe_make_prepare_finalize(\n" + " moe=moe_config,\n" +) + +MXFP4_OLD_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +MXFP4_NEW_CALL = ( + " routing_tables=routing_tables,\n" + " allow_new_interface=True,\n" + " use_monolithic=is_monolithic,\n" + " defer_input_quant=defer_input_quant,\n" + " )\n" + " assert prepare_finalize is not None\n" +) + +# --- File 4: fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py -------- + +PREP_TARGET = VLLM_ROOT / ( + "model_executor/layers/fused_moe/prepare_finalize/flashinfer_nvlink_one_sided.py" +) + +PREP_OLD_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_NEW_INIT = ( + " num_experts: int,\n" + " hidden_size: int,\n" + " num_dispatchers: int = 1,\n" + " dispatch_dtype_bytes_per_elem: int = 0,\n" + " dispatch_has_fp8_scale: bool = True,\n" + " ):\n" + " super().__init__()\n" +) + +PREP_OLD_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " )\n" +) + +PREP_NEW_CALL = ( + " top_k=self.top_k,\n" + " num_experts=self.num_experts,\n" + " hidden_size=self.hidden_size,\n" + " dispatch_dtype_bytes_per_elem=dispatch_dtype_bytes_per_elem,\n" + " dispatch_has_fp8_scale=dispatch_has_fp8_scale,\n" + " )\n" +) + +PREP_OLD_QUANT = ( + " a1q, a1q_scale = 
moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +PREP_NEW_QUANT = ( + " if defer_input_quant:\n" + " # Experts (e.g. trtllm_mxfp4_moe with mxfp8 activations) will\n" + " # quantize post-dispatch. Ship bf16 tokens and skip scales.\n" + " a1q, a1q_scale = a1, None\n" + " else:\n" + " a1q, a1q_scale = moe_kernel_quantize_input(\n" + " a1,\n" + " quant_config.a1_gscale,\n" + " quant_config.quant_dtype,\n" + " quant_config.per_act_token_quant,\n" + " quant_config.block_shape,\n" + " is_fp4_scale_swizzled=False, # delay swizzle to after comm\n" + " )\n" +) + +# (target file, marker indicating already-patched, [(name, old, new), ...]) +FILES = [ + ( + ALL2ALL_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("MoeAlltoAll.initialize signature", ALL2ALL_OLD, ALL2ALL_NEW), + ("MoeAlltoAll dispatch payload sizing", ALL2ALL_OLD_PAYLOAD, ALL2ALL_NEW_PAYLOAD), + ], + ), + ( + ALL2ALL_UTILS_TARGET, + # Anchor on a string this patch introduces — namely the new parameter + # in maybe_make_prepare_finalize's signature. Bare "defer_input_quant" + # appears in unrelated comments in the base file, so use the full + # declaration form for uniqueness. + "defer_input_quant: bool = False,", + [ + ("maybe_make_prepare_finalize signature", ALL2ALL_UTILS_OLD_SIG, ALL2ALL_UTILS_NEW_SIG), + ( + "remove nvfp4-only guard (local, not from PR 40960)", + ALL2ALL_UTILS_OLD_GUARD, + ALL2ALL_UTILS_NEW_GUARD, + ), + ( + "FlashInferNVLinkOneSided builder", + ALL2ALL_UTILS_OLD_BUILD, + ALL2ALL_UTILS_NEW_BUILD, + ), + ], + ), + ( + MXFP4_TARGET, + # Anchor on the assignment introduced by this patch. + "defer_input_quant = issubclass(experts_cls, TrtLlmMxfp4ExpertsBase)", + [ + ("make_mxfp4_moe_kernel defer_input_quant detection", MXFP4_OLD, MXFP4_NEW), + ("make_mxfp4_moe_kernel pass-through", MXFP4_OLD_CALL, MXFP4_NEW_CALL), + ], + ), + ( + PREP_TARGET, + "dispatch_dtype_bytes_per_elem", + [ + ("FlashInferNVLinkOneSided __init__ signature", PREP_OLD_INIT, PREP_NEW_INIT), + ("FlashInferNVLinkOneSided initialize call", PREP_OLD_CALL, PREP_NEW_CALL), + ("FlashInferNVLinkOneSided prepare quant branch", PREP_OLD_QUANT, PREP_NEW_QUANT), + ], + ), +] + + +def patch_file(target: Path, marker: str, patches: list[tuple[str, str, str]]) -> bool: + if not target.exists(): + print(f"[nvlink-bf16-patch] Target not found: {target}", file=sys.stderr) + return False + + content = target.read_text() + if marker in content: + print(f"[nvlink-bf16-patch] {target.name}: already patched, skipping.", file=sys.stderr) + return True + + new_content = content + for name, old, new in patches: + count = new_content.count(old) + if count == 0: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} not found. 
" + "vLLM version may have drifted.", + file=sys.stderr, + ) + return False + if count > 1: + print( + f"[nvlink-bf16-patch] {target.name}: anchor for {name!r} is ambiguous " + f"({count} matches); refusing to patch.", + file=sys.stderr, + ) + return False + new_content = new_content.replace(old, new, 1) + print(f"[nvlink-bf16-patch] {target.name}: patched {name}", file=sys.stderr) + + target.write_text(new_content) + return True + + +def main(): + failures = 0 + for target, marker, patches in FILES: + if not patch_file(target, marker, patches): + failures += 1 + + if failures: + print(f"[nvlink-bf16-patch] {failures} file(s) failed to patch", file=sys.stderr) + sys.exit(1) + print("[nvlink-bf16-patch] Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/configs/prefetch-ai-dynamo-wheel.sh b/configs/prefetch-ai-dynamo-wheel.sh new file mode 100755 index 00000000..2a770493 --- /dev/null +++ b/configs/prefetch-ai-dynamo-wheel.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +DYNAMO_VERSION="${DYNAMO_VERSION:-}" + +if [ -z "${DYNAMO_VERSION}" ]; then + echo "ERROR: DYNAMO_VERSION must be set for ai-dynamo wheel prefetch" >&2 + exit 1 +fi + +DYNAMO_PACKAGE="ai-dynamo==${DYNAMO_VERSION}" +DYNAMO_RUNTIME_PACKAGE="ai-dynamo-runtime==${DYNAMO_VERSION}" +DYNAMO_WHEEL_NAME="${DYNAMO_WHEEL_NAME:-ai_dynamo-${DYNAMO_VERSION}-py3-none-any.whl}" +DYNAMO_RUNTIME_WHEEL_PATTERN="${DYNAMO_RUNTIME_WHEEL_PATTERN:-ai_dynamo_runtime-${DYNAMO_VERSION}-*.whl}" +DYNAMO_INDEX_URL="${DYNAMO_INDEX_URL:-https://pypi.org/simple}" +DYNAMO_EXTRA_INDEX_URL="${DYNAMO_EXTRA_INDEX_URL:-https://pypi.nvidia.com}" + +source_dir="${SRTCTL_SOURCE_DIR:-$(pwd)}" +wheel_dir="${DYNAMO_WHEEL_HOST_DIR:-${source_dir}/configs/wheels}" +wheel_path="${wheel_dir}/${DYNAMO_WHEEL_NAME}" +lock_path="${wheel_dir}/.${DYNAMO_WHEEL_NAME}.lock" + +mkdir -p "${wheel_dir}" + +runtime_wheel_path() { + find "${wheel_dir}" -maxdepth 1 -type f -name "${DYNAMO_RUNTIME_WHEEL_PATTERN}" -print -quit +} + +python_with_pip() { + if python3 -m pip --version >/dev/null 2>&1; then + command -v python3 + return + fi + + if ! command -v uv >/dev/null 2>&1; then + echo "ERROR: python3 does not provide pip, and uv is unavailable to create a pip-seeded prefetch venv" >&2 + return 1 + fi + + local prefetch_venv="${DYNAMO_PREFETCH_VENV:-${wheel_dir}/.prefetch-venv}" + uv venv --seed "${prefetch_venv}" >/dev/null + echo "${prefetch_venv}/bin/python" +} + +if [ -f "${wheel_path}" ] && [ -n "$(runtime_wheel_path)" ]; then + echo "ai-dynamo wheels already staged: ${wheel_dir}" + exit 0 +fi + +download_wheels() { + local python_bin + python_bin="$(python_with_pip)" + "${python_bin}" -m pip download \ + --no-deps \ + --pre \ + --only-binary=:all: \ + --index-url "${DYNAMO_INDEX_URL}" \ + --extra-index-url "${DYNAMO_EXTRA_INDEX_URL}" \ + --dest "${wheel_dir}" \ + "${DYNAMO_RUNTIME_PACKAGE}" \ + "${DYNAMO_PACKAGE}" +} + +if command -v flock >/dev/null 2>&1; then + ( + flock -x 9 + if [ ! -f "${wheel_path}" ] || [ -z "$(runtime_wheel_path)" ]; then + echo "Staging ai-dynamo wheels: ${DYNAMO_RUNTIME_PACKAGE} ${DYNAMO_PACKAGE} -> ${wheel_dir}" + download_wheels + fi + ) 9>"${lock_path}" +else + echo "Staging ai-dynamo wheels: ${DYNAMO_RUNTIME_PACKAGE} ${DYNAMO_PACKAGE} -> ${wheel_dir}" + download_wheels +fi + +if [ ! 
-f "${wheel_path}" ]; then + echo "ERROR: expected ${wheel_path} after download" >&2 + exit 1 +fi + +if [ -z "$(runtime_wheel_path)" ]; then + echo "ERROR: expected ${DYNAMO_RUNTIME_WHEEL_PATTERN} in ${wheel_dir} after download" >&2 + exit 1 +fi diff --git a/configs/vllm-container-deps-one-sided.sh b/configs/vllm-container-deps-one-sided.sh new file mode 100644 index 00000000..d716e84f --- /dev/null +++ b/configs/vllm-container-deps-one-sided.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +apt-get -y update && apt-get install -y --no-install-recommends --allow-change-held-packages numactl + +pip install msgpack + +if [ -n "${DYNAMO_VERSION:-}" ] || [ -n "${DYNAMO_WHEEL_NAME:-}" ]; then + if [ -f /configs/install-ai-dynamo.sh ]; then + bash /configs/install-ai-dynamo.sh + else + echo "ERROR: /configs/install-ai-dynamo.sh not found for ai-dynamo wheel install" >&2 + exit 1 + fi +fi + +# Upgrade FlashInfer for the NVLink one-sided all-to-all bf16 dispatch patch. +# flashinfer-python / flashinfer-cubin publish on PyPI; flashinfer-jit-cache is +# CUDA-specific and only on the cu130 index. --index-url replaces PyPI entirely, +# so split into two calls. +pip install --upgrade flashinfer-python==0.6.9 flashinfer-cubin==0.6.9 +pip install --upgrade flashinfer-jit-cache==0.6.9 --index-url https://flashinfer.ai/whl/cu130 + +if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then + python3 /configs/patches/vllm_numa_bind_hash_fix.py +fi + +if [ -f /configs/patches/vllm_nvlink_one_sided_bf16_fix.py ]; then + python3 /configs/patches/vllm_nvlink_one_sided_bf16_fix.py +fi diff --git a/configs/vllm-container-deps.sh b/configs/vllm-container-deps.sh index 43807255..1d48d023 100644 --- a/configs/vllm-container-deps.sh +++ b/configs/vllm-container-deps.sh @@ -2,4 +2,26 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -pip install msgpack \ No newline at end of file +pip install msgpack + +if [ -n "${DYNAMO_VERSION:-}" ] || [ -n "${DYNAMO_WHEEL_NAME:-}" ]; then + if [ -f /configs/install-ai-dynamo.sh ]; then + bash /configs/install-ai-dynamo.sh + else + echo "ERROR: /configs/install-ai-dynamo.sh not found for ai-dynamo wheel install" >&2 + exit 1 + fi +fi + +if [ -f /configs/patches/vllm_numa_bind_hash_fix.py ]; then + python3 /configs/patches/vllm_numa_bind_hash_fix.py +fi + +if [ -f /configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py ]; then + python3 /configs/patches/vllm_nvlink_one_sided_bf16_fix_v20.py +fi + +if [ -f /configs/patches/vllm_cumem_expandable_segments_fix.py ]; then + python3 /configs/patches/vllm_cumem_expandable_segments_fix.py +fi + diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p17d-tep4-tp4.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p17d-tep4-tp4.yaml new file mode 100644 index 00000000..0dea47db --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p17d-tep4-tp4.yaml @@ -0,0 +1,93 @@ +name: "vllm-disagg-gb300-1p17d-tep4-tp4" +model: + path: "dsv4_pro" + container: "vllm/vllm-openai:v0.20.0-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 17 + prefill_workers: 1 + decode_workers: 17 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "18x36x72" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git 
a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p6d-dep4-tp4.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p6d-dep4-tp4.yaml
new file mode 100644
index 00000000..c1d324ca
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-1p6d-dep4-tp4.yaml
@@ -0,0 +1,99 @@
+name: "vllm-disagg-gb300-1p6d-dep4-tp4"
+model:
+  path: "dsv4_pro"
+  container: "vllm/vllm-openai:v0.20.0-cu130"
+  precision: "fp4"
+
+dynamo:
+  install: true
+  wheel: "1.2.0.dev20260426"
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 1
+  decode_nodes: 6
+  prefill_workers: 1
+  decode_workers: 6
+  gpus_per_prefill: 4
+  gpus_per_decode: 4
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: false
+
+backend:
+  type: vllm
+  connector: null
+  prefill_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+  decode_environment:
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+
+  vllm_config:
+    prefill:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 1
+      pipeline-parallel-size: 1
+      data-parallel-size: 4
+      data-parallel-rpc-port: 13345
+      enable-expert-parallel: true
+      attention-config: '{"use_fp4_indexer_cache": true}'
+      moe-backend: "deep_gemm_mega_moe"
+      enforce-eager: true
+      max-model-len: 16384
+      max-num-seqs: 256
+      max-num-batched-tokens: 16384
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      no-enable-flashinfer-autotune: true
+      no-async-scheduling: true
+      block-size: 256
+      gpu-memory-utilization: 0.9
+      enable-ep-weight-filter: true
+      no-disable-hybrid-kv-cache-manager: true
+      enable-sleep-mode: true
+    decode:
+      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      kv-cache-dtype: "fp8"
+      tensor-parallel-size: 4
+      pipeline-parallel-size: 1
+      max-model-len: 16384
+      max-num-seqs: 512
+      max-cudagraph-capture-size: 512
+      max-num-batched-tokens: 512
+      trust-remote-code: true
+      no-enable-prefix-caching: true
+      block-size: 256
+      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
+      gpu-memory-utilization: 0.9
+      stream-interval: 50
+      no-disable-hybrid-kv-cache-manager: true
+      enable-ep-weight-filter: true
+      all2all-backend: "flashinfer_nvlink_one_sided"
+      no-enable-flashinfer-autotune: true
+      enable-sleep-mode: true
+      tokenizer-mode: deepseek_v4
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "192x256"
+  req_rate: "inf"
+  tokenizer_mode: "deepseek_v4"
+  use_chat_template: true
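The dynamo: block above is what the wheel scripts earlier in this diff serve.
A minimal sketch of the intended flow, assuming the launcher maps the recipe's
wheel: value to DYNAMO_VERSION and bind-mounts the repo's configs/ directory at
/configs inside the container (neither mapping is shown in this diff):

  # Submit host: stage both wheels into configs/wheels/ (gitignored above).
  DYNAMO_VERSION=1.2.0.dev20260426 ./configs/prefetch-ai-dynamo-wheel.sh
  # Container: the setup_script then installs them offline from
  # DYNAMO_WHEEL_DIRS="/configs/wheels /configs" and applies the patches.
  DYNAMO_VERSION=1.2.0.dev20260426 bash /configs/vllm-container-deps.sh

diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml
new file mode 100644
index 00000000..83e78d46
--- /dev/null
+++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-4p1d-dep4-dep8-24-c4096.yaml
@@ -0,0 +1,108 @@
+name: "dsv4-vllm-disagg-gb300-4p1d-dep4-dep8"
+
+model:
+  path: "dsv4-pro"
+  container: "vllm/vllm-openai:v0.20.0-aarch64-cu130"
+  precision: "fp4"
+
+dynamo:
+  wheel: "1.2.0.dev20260426"
+  install: true
+
+setup_script: vllm-container-deps.sh
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  prefill_nodes: 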
4 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml new file mode 100644 index 00000000..119fafcf --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-5p1d-dep4-dep8-28-c4096.yaml @@ -0,0 +1,108 @@ +name: "dsv4-vllm-disagg-gb300-5p1d-dep4-dep8" + +model: + path: "dsv4-pro" + container: "vllm/vllm-openai:v0.20.0-aarch64-cu130" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 5 + decode_nodes: 2 + prefill_workers: 5 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + 
VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml new file mode 100644 index 00000000..03c72fa3 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-6p1d-dep4-dep8-32-c4096.yaml @@ -0,0 +1,108 @@ +name: "dsv4-vllm-disagg-gb300-6p1d-dep4-dep8" + +model: + path: "dsv4-pro" + container: "vllm/vllm-openai:v0.20.0-aarch64-cu130" + precision: "fp4" + +dynamo: + wheel: "1.2.0.dev20260426" + install: true + +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 6 + decode_workers: 1 + gpus_per_prefill: 4 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_LOG_STATS_INTERVAL: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + 
max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + safetensors-load-strategy: "prefetch" + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + no-async-scheduling: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + enable-sleep-mode: true + moe-backend: "deep_gemm_mega_moe" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-7p2d-dep4-dep16.yaml b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-7p2d-dep4-dep16.yaml new file mode 100644 index 00000000..5278efc0 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa-gb300-8k1k/disagg-gb300-7p2d-dep4-dep16.yaml @@ -0,0 +1,104 @@ +name: "vllm-disagg-gb300-7p2d-dep4-dep16" +model: + path: "dsv4-pro" + container: "vllm/vllm-openai:v0.20.0-aarch64-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 7 + decode_nodes: 8 + prefill_workers: 7 + decode_workers: 2 + gpus_per_prefill: 4 + gpus_per_decode: 16 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + TORCH_SYMMMEM: "NVSHMEM" + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 256 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + enable-ep-weight-filter: true + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 +
data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + attention-config: '{"use_fp4_indexer_cache": true}' + moe-backend: "deep_gemm_mega_moe" + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + no-enable-flashinfer-autotune: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x3072" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml new file mode 100644 index 00000000..ed2004e5 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b + install: false + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + 
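+      # Editor's note (hedged): the offload-* knobs below are read here as
+      # host-offloading one parameter group in every three and prefetching two
+      # steps ahead; their exact semantics are not documented in this diff.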
offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "64x128x256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 00000000..a29d51a0 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb200-1p4d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + 
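+      # Editor's note (hedged): prefix caching is disabled below, presumably so
+      # every benchmark request pays the full prefill cost and runs stay
+      # comparable across recipes.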
no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 00000000..d79b78f4 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb200-1p8d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + 
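+      # Editor's note (hedged): "dep8" in these recipe names appears to denote
+      # tensor-parallel-size 1 with data-parallel-size 8 plus
+      # enable-expert-parallel, i.e. experts sharded across all 8 ranks.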
tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64x128x256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..2a175a5a --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,115 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": 
"NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml new file mode 100644 index 00000000..fd3f4f36 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -0,0 +1,115 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + 
NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..417d8476 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro-sa/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,115 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + hash: 6a159fedd8e4a1563aa647c31f622aedbf254b5b +# version: 1.0.2 + install: false + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + # VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + # VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: 
"n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml new file mode 100644 index 00000000..83b5e6a4 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-dep8-16-c256-c512-c1024-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + 
VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml new file mode 100644 index 00000000..5943c41c --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml @@ -0,0 +1,119 @@ +name: "svf-vllm-disagg-gb200-1p1d-dep8-tep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + 
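+    # Editor's note (hedged): decode mirrors the prefill environment; the
+    # 3600 s ready timeout covers slow multi-node engine start-up, and
+    # VLLM_SERVER_DEV_MODE=1 exposes the development endpoints (e.g.
+    # sleep/wake_up) that pair with enable-sleep-mode in the engine flags.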
VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + attention-config: '{"use_fp4_indexer_cache":true}' + compilation-config: '{"mode":0,"cudagraph_mode":"FULL_DECODE_ONLY","pass_config":{"fuse_allreduce_rms":false}}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x2x4x8x16x32x64" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml new file mode 100644 index 00000000..e35f9f62 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p4d-dep8-tp8-c256-c512-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p4d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" 
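+    # Editor's note (hedged): randomized DP dummy inputs plus the
+    # uniform_random routing simulation below appear intended to emulate
+    # balanced expert routing, so the benchmark measures best-case EP load
+    # balance rather than the real router's behavior.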
+ VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml new file mode 100644 index 00000000..dc2401f8 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-1p8d-dep8-tp8-c8-c16-c32-c64-c128-c256-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-1p8d-dep8-tp8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +health_check: + max_attempts: 360 + interval_seconds: 10 + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 16 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + 
NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 +# data-parallel-size: 8 +# data-parallel-rpc-port: 13345 +# enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x32x64x128x256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml new file mode 100644 index 00000000..ef6ad86e --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c256-c512-c1024.yaml @@ -0,0 +1,110 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps-one-sided.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: 
false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" + enable-ep-weight-filter: true + + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..e7fa4a49 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-2p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-2p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + 
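+    # Editor's note (hedged): VLLM_SPARSE_INDEXER_MAX_LOGITS_MB (above) and
+    # VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE (below) read as buffer caps -- 1024 MB
+    # of indexer logits, 2048 tokens per expert on the FP4 MoE path; both are
+    # vLLM-internal knobs not documented in this diff.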
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml new file mode 100644 index 00000000..0b456f83 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-40-c2048-c4096-offload.yaml @@ -0,0 +1,118 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + 
VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 9280 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 9280 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "2048x4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml new file mode 100644 index 00000000..043cd370 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep16-c4096-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep16-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 4 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + 
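+    # Editor's note (hedged): TILELANG_CLEANUP_TEMP_FILES=1 is taken to clear
+    # TileLang JIT build artifacts between runs; the rest of this environment
+    # matches the other offload recipes in this directory.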
TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml new file mode 100644 index 00000000..df713575 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c2048.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps-one-sided.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + 
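+  # Editor's note: a single Dynamo frontend (enable_multiple_frontends: false)
+  # fronts 3 prefill workers x 8 GPUs = 24 GPUs over 6 nodes and 1 decode
+  # worker x 8 GPUs over 2 nodes (4 GPUs per node, per resources above).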
type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + all2all-backend: "flashinfer_nvlink_one_sided" + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml new file mode 100644 index 00000000..e39ea0df --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-3p1d-dep8-dep8-c4096-offload.yaml @@ -0,0 +1,114 @@ +name: "svf-vllm-disagg-gb200-3p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai:deepseekv4-cu130" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 6 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + 
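+    # Editor's note (hedged): the UCX_* settings below pin NIXL's UCX
+    # transports to CUDA copy/IPC and TCP for the KV-cache transfer path, and
+    # NCCL_P2P_LEVEL=NVL keeps CUDA peer-to-peer on NVLink-connected pairs.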
VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml new file mode 100644 index 00000000..87475266 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb200-7p1d-dep8-dep16-c4096-c8192.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb200-7p1d-dep8-dep16" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" + +setup_script: vllm-container-deps-one-sided.sh +health_check: + max_attempts: 360 + interval_seconds: 10 + +resources: + gpu_type: "gb200" + gpus_per_node: 4 + prefill_nodes: 14 + decode_nodes: 4 + prefill_workers: 7 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + VLLM_ENGINE_READY_TIMEOUT_S: 
"3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + decode_environment: + VLLM_ENGINE_READY_TIMEOUT_S: "3600" + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1" + VLLM_MOE_ROUTING_SIMULATION_STRATEGY: "uniform_random" + VLLM_LOG_STATS_INTERVAL: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + tokenizer-mode: deepseek_v4 + enable-ep-weight-filter: true + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 512 + max-cudagraph-capture-size: 512 + max-num-batched-tokens: 512 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + tokenizer-mode: deepseek_v4 + all2all-backend: "flashinfer_nvlink_one_sided" + enable-ep-weight-filter: true + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512x1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-dep8-tp4-c256-c512.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-dep8-tp4-c256-c512.yaml new file mode 100644 index 00000000..b8f7592d --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-dep8-tp4-c256-c512.yaml @@ -0,0 +1,111 @@ +name: "svf-vllm-disagg-gb300-1p12d-dep8-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 12 + prefill_workers: 1 + decode_workers: 12 + gpus_per_prefill: 8 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + 
UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-tp4-tp4-c32-c128.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-tp4-tp4-c32-c128.yaml new file mode 100644 index 00000000..88571eb7 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p12d-tp4-tp4-c32-c128.yaml @@ -0,0 +1,96 @@ +name: "svf-vllm-disagg-gb300-1p12d-tp4-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 12 + prefill_workers: 1 + decode_workers: 12 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: 
NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 10240 #auto + max-num-seqs: 128 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "32x128" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-dep8-tep8-c1024.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-dep8-tep8-c1024.yaml new file mode 100644 index 00000000..eff5968d --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-dep8-tep8-c1024.yaml @@ -0,0 +1,112 @@ +name: "svf-vllm-disagg-gb300-1p4d-dep8-tep8" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 8 + gpus_per_decode: 8 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + 
no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 8 + pipeline-parallel-size: 1 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 4096 + max-cudagraph-capture-size: 4096 + max-num-batched-tokens: 4096 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1024" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-tp4-tp4-c128.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-tp4-tp4-c128.yaml new file mode 100644 index 00000000..df12dd7a --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p4d-tp4-tp4-c128.yaml @@ -0,0 +1,89 @@ +name: "svf-vllm-disagg-gb300-1p4d-tp4-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 10240 + max-num-seqs: 128 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 10240 + max-num-seqs: 64 + max-cudagraph-capture-size: 64 + max-num-batched-tokens: 64 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: 
'{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "128" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-dep8-tp4-c512.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-dep8-tp4-c512.yaml new file mode 100644 index 00000000..2928a199 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-dep8-tp4-c512.yaml @@ -0,0 +1,111 @@ +name: "svf-vllm-disagg-gb300-1p8d-dep8-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 2 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 8 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 64 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + 
isl: 8192 + osl: 1024 + concurrencies: "512" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-tp4-tp4-c8-c16-c128.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-tp4-tp4-c8-c16-c128.yaml new file mode 100644 index 00000000..3c2f1b9f --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-1p8d-tp4-tp4-c8-c16-c128.yaml @@ -0,0 +1,96 @@ +name: "svf-vllm-disagg-gb300-1p8d-tp4-tp4" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + decode_nodes: 8 + prefill_workers: 1 + decode_workers: 8 + gpus_per_prefill: 4 + gpus_per_decode: 4 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + enforce-eager: true + max-model-len: 10240 + max-num-seqs: 128 + max-num-batched-tokens: 16384 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.9 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 4 + pipeline-parallel-size: 1 + max-model-len: 16384 + max-num-seqs: 1024 + max-cudagraph-capture-size: 1024 + max-num-batched-tokens: 1024 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "8x16x128" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep16-offload-c512-c4096.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep16-offload-c512-c4096.yaml new file mode 100644 index 00000000..2aa2464e --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep16-offload-c512-c4096.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb300-2p1d-dep8-dep16-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: 
"1.2.0.dev20260426" +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 4 + decode_nodes: 4 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 16 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 16 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 128 + max-cudagraph-capture-size: 128 + max-num-batched-tokens: 128 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep8-offload-c4096.yaml b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep8-offload-c4096.yaml new file mode 100644 index 00000000..21b3d196 --- /dev/null +++ b/recipes/vllm/deepseek-v4-pro/8k1k/disagg-gb300-2p1d-dep8-dep8-offload-c4096.yaml @@ -0,0 +1,109 @@ +name: "svf-vllm-disagg-gb300-2p1d-dep8-dep8-offload" +model: + path: "deepseekv4-fp4" + container: "vllm/vllm-openai@sha256:d29a90b13bb9758821839dd810db9679055e8adf7c670df9f0a432f45f2488a5" + precision: "fp4" + +dynamo: + install: true + wheel: "1.2.0.dev20260426" +setup_script: vllm-container-deps.sh +resources: + gpu_type: "gb300" + gpus_per_node: 4 + 
prefill_nodes: 4 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 8 + gpus_per_decode: 8 +frontend: + type: dynamo + enable_multiple_frontends: false +backend: + type: vllm + connector: null + prefill_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024" + VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + decode_environment: + TILELANG_CLEANUP_TEMP_FILES: "1" + VLLM_USE_NCCL_SYMM_MEM: "1" + NCCL_CUMEM_ENABLE: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_NVLS_ENABLE: "1" + VLLM_SERVER_DEV_MODE: "1" + UCX_MEMTYPE_CACHE: "n" + UCX_MEMTYPE_REG_WHOLE: "n" + UCX_TLS: "cuda_copy,cuda_ipc,tcp" + UCX_CUDA_IPC_ENABLE_MNNVL: "y" + NCCL_P2P_LEVEL: NVL + vllm_config: + prefill: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + enforce-eager: true + max-model-len: 16384 + max-num-seqs: 16 + max-num-batched-tokens: 32768 + trust-remote-code: true + no-enable-prefix-caching: true + no-enable-flashinfer-autotune: true + no-async-scheduling: true + block-size: 256 + gpu-memory-utilization: 0.8 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + numa-bind: true + offload-group-size: 3 + offload-num-in-group: 1 + offload-prefetch-step: 2 + # offload-params: "w13_weight w2_weight w13_weight_scale w2_weight_scale wq_b wo_a wo_b shared_experts" + tokenizer-mode: deepseek_v4 + decode: + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + kv-cache-dtype: "fp8" + tensor-parallel-size: 1 + pipeline-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + max-model-len: 16384 + max-num-seqs: 256 + max-cudagraph-capture-size: 256 + max-num-batched-tokens: 256 + trust-remote-code: true + no-enable-prefix-caching: true + block-size: 256 + compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}' + gpu-memory-utilization: 0.9 + stream-interval: 50 + no-disable-hybrid-kv-cache-manager: true + enable-sleep-mode: true + tokenizer-mode: deepseek_v4 +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" + tokenizer_mode: "deepseek_v4" + use_chat_template: true diff --git a/src/srtctl/benchmarks/sa_bench.py b/src/srtctl/benchmarks/sa_bench.py index 5f220393..e690cb19 100644 --- a/src/srtctl/benchmarks/sa_bench.py +++ b/src/srtctl/benchmarks/sa_bench.py @@ -101,5 +101,6 @@ def build_command( str(b.num_warmup_mult) if b.num_warmup_mult is not None else "2", b.custom_tokenizer or "", str(b.use_chat_template).lower(), + b.tokenizer_mode or "auto", ] return cmd diff --git a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py index 87f3f9ef..0014f221 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/backend_request_func.py @@ -629,10 +629,30 @@ def get_tokenizer( "to use mistral tokenizer mode." 
) from e return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path)) + if tokenizer_mode == "deepseek_v4": + try: + from vllm.tokenizers.deepseek_v4 import DeepseekV4Tokenizer + except ImportError as e: + raise ImportError( + "DeepseekV4Tokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use deepseek_v4 tokenizer mode." + ) from e + return DeepseekV4Tokenizer.from_pretrained(str(pretrained_model_name_or_path)) if custom_tokenizer: if custom_tokenizer == "glm_moe_dsa": return _load_glm_moe_dsa_tokenizer(pretrained_model_name_or_path) + if custom_tokenizer == "deepseek_v4": + try: + from vllm.tokenizers.deepseek_v4 import DeepseekV4Tokenizer + except ImportError as e: + raise ImportError( + "DeepseekV4Tokenizer requires vllm package.\n" + "Please install it with `pip install vllm` " + "to use deepseek_v4 tokenizer." + ) from e + return DeepseekV4Tokenizer.from_pretrained(str(pretrained_model_name_or_path)) from importlib import import_module try: module_path, class_name = custom_tokenizer.rsplit('.', 1) diff --git a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh index acddf754..999705e0 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/bench.sh +++ b/src/srtctl/benchmarks/scripts/sa-bench/bench.sh @@ -64,6 +64,10 @@ NUM_PROMPTS_MULT=${13:-10} NUM_WARMUP_MULT=${14:-2} CUSTOM_TOKENIZER=${15:-} USE_CHAT_TEMPLATE=${16:-true} +TOKENIZER_MODE=${17:-auto} + +# Build optional tokenizer mode args +TOKENIZER_MODE_ARGS=(--tokenizer-mode "$TOKENIZER_MODE") # Build optional custom tokenizer args CUSTOM_TOKENIZER_ARGS=() @@ -136,6 +140,8 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --percentile-metrics ttft,tpot,itl,e2el \ --max-concurrency "$concurrency" \ --trust-remote-code \ + "${TOKENIZER_MODE_ARGS[@]}" \ + "${CHAT_TEMPLATE_ARGS[@]}" \ "${CUSTOM_TOKENIZER_ARGS[@]}" num_prompts=$((concurrency * 10)) @@ -166,6 +172,7 @@ for concurrency in "${CONCURRENCY_LIST[@]}"; do --percentile-metrics ttft,tpot,itl,e2el \ --max-concurrency "$concurrency" \ --trust-remote-code \ + "${TOKENIZER_MODE_ARGS[@]}" \ "${CHAT_TEMPLATE_ARGS[@]}" \ "${CUSTOM_TOKENIZER_ARGS[@]}" \ --save-result --result-dir "$result_dir" --result-filename "$result_filename" @@ -179,4 +186,3 @@ done stop_all_profiling echo "SA-Bench complete. Results in $result_dir" - diff --git a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py index a5ea6490..952a8b23 100644 --- a/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py +++ b/src/srtctl/benchmarks/scripts/sa-bench/benchmark_serving.py @@ -1272,11 +1272,12 @@ def main(args: argparse.Namespace): "--tokenizer-mode", type=str, default="auto", - choices=["auto", "slow", "mistral", "custom"], + choices=["auto", "slow", "mistral", "custom", "deepseek_v4"], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"mistral" will always use the `mistral_common` tokenizer. \n* ' + '"deepseek_v4" will use vLLM\'s DeepSeek V4 tokenizer. 
\n* ' '"custom" will use --tokenizer to select the preregistered tokenizer.', ) diff --git a/src/srtctl/cli/submit.py b/src/srtctl/cli/submit.py index 21f26d9f..697f14e6 100644 --- a/src/srtctl/cli/submit.py +++ b/src/srtctl/cli/submit.py @@ -19,6 +19,7 @@ import logging import os import re +import shlex import shutil import subprocess import sys @@ -115,7 +116,8 @@ def show_config_details(config: SrtConfig) -> None: console.print(Panel(mounts_table, border_style="green")) # --- Environment Variables --- - has_env = bool(config.environment) + dynamo_environment = config.dynamo.get_wheel_environment() + has_env = bool(config.environment or dynamo_environment) backend = config.backend mode_envs: list[tuple[str, dict[str, str]]] = [] for mode_name, attr in [ @@ -134,6 +136,9 @@ def show_config_details(config: SrtConfig) -> None: env_table.add_column("Variable", style="yellow") env_table.add_column("Value", style="white") + for var, val in sorted(dynamo_environment.items()): + env_table.add_row("dynamo", var, val) + for var, val in sorted(config.environment.items()): env_table.add_row("global", var, val) @@ -207,6 +212,8 @@ def generate_minimal_sbatch_script( container_image = os.path.expandvars(config.model.container) job_name = get_job_name(config) + config_environment = config.dynamo.get_wheel_environment() + config_environment.update(config.environment) rendered = template.render( job_name=job_name, @@ -227,6 +234,7 @@ def generate_minimal_sbatch_script( srtctl_source=str(srtctl_source.resolve()), output_base=output_base, setup_script=setup_script, + config_environment={key: shlex.quote(str(value)) for key, value in config_environment.items()}, ) return rendered diff --git a/src/srtctl/core/runtime.py b/src/srtctl/core/runtime.py index 31195ed3..2a65c67b 100644 --- a/src/srtctl/core/runtime.py +++ b/src/srtctl/core/runtime.py @@ -242,6 +242,9 @@ def from_config( # Add FormattablePath mounts from config.container_mounts # These need to be expanded with the runtime context, so we create a # temporary context first and then update + environment = config.dynamo.get_wheel_environment() + environment.update(config.environment) + temp_context = cls( job_id=job_id, run_name=run_name, @@ -255,7 +258,7 @@ def from_config( network_interface=get_srtslurm_setting("network_interface", "eth0"), container_mounts={}, srun_options=dict(config.srun_options), - environment=dict(config.environment), + environment=environment, is_hf_model=is_hf_model, ) @@ -278,7 +281,7 @@ def from_config( network_interface=get_srtslurm_setting("network_interface", "eth0"), container_mounts=container_mounts, srun_options=dict(config.srun_options), - environment=dict(config.environment), + environment=environment, is_hf_model=is_hf_model, ) diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py index c535be39..a6e84bbd 100644 --- a/src/srtctl/core/schema.py +++ b/src/srtctl/core/schema.py @@ -14,6 +14,7 @@ import builtins import itertools import logging +import shlex from collections.abc import Iterator, Mapping from dataclasses import field from enum import Enum @@ -543,6 +544,7 @@ class BenchmarkConfig: num_warmup_mult: int | None = None # Multiplier for warmup prompts = concurrency * mult (default: 2) # Trace replay benchmark fields (uses aiperf with mooncake_trace dataset type) trace_file: str | None = None # Path to trace JSONL file (container path, e.g., /traces/dataset.jsonl) + tokenizer_mode: str | None = None # Tokenizer mode passed to SA-Bench (e.g., "auto", "deepseek_v4") custom_tokenizer: str | None = 
None # Custom tokenizer class (e.g., "module.path.ClassName") use_chat_template: bool = True # Pass --use-chat-template to benchmark (default: true) @@ -680,7 +682,7 @@ def get_nsys_prefix(self, output_file: str, *, frontend_type: str | None = None) class DynamoConfig: """Dynamo installation configuration. - Only one of version, hash, or top_of_tree should be specified. + Only one of version, hash, top_of_tree, or wheel should be specified. Defaults to version="0.8.0" (pip install). Options: @@ -689,31 +691,133 @@ class DynamoConfig: version: Install specific version from PyPI (e.g., "0.8.0") hash: Clone repo and checkout specific commit hash top_of_tree: Clone repo at HEAD (latest) + wheel: ai-dynamo package version to install via staged wheels. The + matching ai-dynamo-runtime wheel is installed automatically. - If top_of_tree or hash is set, version is automatically cleared. + If top_of_tree, hash, or wheel is set, version is automatically cleared. """ install: bool = True version: str | None = "0.8.0" hash: str | None = None top_of_tree: bool = False + wheel: str | None = None def __post_init__(self) -> None: - # Auto-clear version if hash or top_of_tree is set - if self.hash is not None or self.top_of_tree: + install_sources = [ + ("hash", self.hash is not None), + ("top_of_tree", self.top_of_tree), + ("wheel", self.wheel is not None), + ] + enabled_sources = [name for name, enabled in install_sources if enabled] + + # Auto-clear version if another install source is set. + if enabled_sources: object.__setattr__(self, "version", None) # Validate only one source option is set - if self.hash is not None and self.top_of_tree: - raise ValueError("Cannot specify both hash and top_of_tree") + if len(enabled_sources) > 1: + raise ValueError(f"Cannot specify both Dynamo install sources: {', '.join(enabled_sources)}") + + if self.wheel is not None: + if not self.wheel.strip(): + raise ValueError("dynamo.wheel must be a non-empty package version") + if Path(self.wheel).name.endswith(".whl") or "/" in self.wheel: + raise ValueError("dynamo.wheel must be a package version like '1.2.0.dev20260426', not a filename") @property def needs_source_install(self) -> bool: """Whether this config requires a source install (git clone + maturin).""" - return self.hash is not None or self.top_of_tree + return self.wheel is None and (self.hash is not None or self.top_of_tree) + + @property + def wheel_version(self) -> str | None: + """Package version requested for staged wheel installation.""" + return self.wheel + + @property + def wheel_name(self) -> str | None: + """Return the ai-dynamo wheel filename for the requested package version.""" + if not self.wheel: + return None + return f"ai_dynamo-{self.wheel}-py3-none-any.whl" + + def get_wheel_environment(self) -> dict[str, str]: + """Environment variables consumed by ai-dynamo prefetch/setup scripts.""" + if not self.wheel: + return {} + wheel_name = self.wheel_name + env = {"DYNAMO_WHEEL_NAME": wheel_name} if wheel_name else {} + version = self.wheel_version + if version: + env["DYNAMO_VERSION"] = version + return env + + @staticmethod + def _source_install_retry_helpers() -> str: + """Bash helpers for transient network failures during source installs.""" + return ( + "dynamo_retry_git_clone() { " + 'target="$1"; ' + 'attempts="${DYNAMO_INSTALL_RETRIES:-5}"; ' + 'delay="${DYNAMO_INSTALL_RETRY_DELAY:-10}"; ' + 'max_delay="${DYNAMO_INSTALL_RETRY_MAX_DELAY:-120}"; ' + 'jitter="${DYNAMO_INSTALL_RETRY_JITTER:-5}"; ' + "attempt=1; " + "while true; do " + 
'tmp_target="${target}.clone.$$.$attempt"; ' + 'rm -rf "$target" "$tmp_target"; ' + 'if git clone https://github.com/ai-dynamo/dynamo.git "$tmp_target"; then ' + 'rm -rf "$target" && mv "$tmp_target" "$target" && return 0; ' + "else " + "rc=$?; " + "fi; " + 'rm -rf "$tmp_target"; ' + 'if [ "$attempt" -ge "$attempts" ]; then ' + 'echo "Dynamo git clone failed after $attempts attempts" >&2; ' + 'return "$rc"; ' + "fi; " + 'sleep_for="$delay"; ' + 'if [ "$jitter" -gt 0 ]; then sleep_for=$((sleep_for + RANDOM % (jitter + 1))); fi; ' + 'echo "Dynamo git clone failed on attempt $attempt/$attempts (exit $rc); retrying in ${sleep_for}s" >&2; ' + 'sleep "$sleep_for"; ' + "attempt=$((attempt + 1)); " + "delay=$((delay * 2)); " + 'if [ "$delay" -gt "$max_delay" ]; then delay="$max_delay"; fi; ' + "done; " + "}; " + ) def get_install_commands(self) -> str: """Get the bash commands to install dynamo.""" + if self.wheel is not None: + wheel_name = self.wheel_name or Path(self.wheel).name + wheels_path_shell = shlex.quote(f"/configs/wheels/{wheel_name}") + configs_path_shell = shlex.quote(f"/configs/{wheel_name}") + version = self.wheel_version + if not version: + raise ValueError("dynamo.wheel must provide an exact package version") + runtime_package = f"ai-dynamo-runtime=={version}" + runtime_package_shell = shlex.quote(runtime_package) + start_message = shlex.quote(f"Installing ai-dynamo-runtime and ai-dynamo from wheel {wheel_name}...") + done_message = shlex.quote(f"ai-dynamo-runtime and ai-dynamo install path completed for {wheel_name}") + return ( + f"echo {start_message} && " + "if [ -f /configs/install-ai-dynamo.sh ]; then " + "bash /configs/install-ai-dynamo.sh; " + f"elif [ -f {wheels_path_shell} ]; then " + "python3 -m pip install --pre --no-deps --no-index " + f"--find-links /configs/wheels {runtime_package_shell} {wheels_path_shell}; " + f"elif [ -f {configs_path_shell} ]; then " + "python3 -m pip install --pre --no-deps --no-index " + f"--find-links /configs {runtime_package_shell} {configs_path_shell}; " + "else " + f"echo 'ERROR: exact ai-dynamo wheels for {version} were not found in /configs/wheels or /configs' >&2; " + "exit 1; " + "fi && " + f"echo {done_message}" + ) + if self.version is not None: return ( f"echo 'Installing dynamo {self.version}...' && " @@ -729,7 +833,7 @@ def get_install_commands(self) -> str: sglang = ( "apt-get update -qq && apt-get install -y -qq libclang-dev > /dev/null 2>&1 && " "cd /sgl-workspace/ && " - "git clone https://github.com/ai-dynamo/dynamo.git && " + "dynamo_retry_git_clone dynamo && " "cd dynamo && " f"{checkout_cmd + ' && ' if checkout_cmd else ''}" "cd lib/bindings/python/ && " @@ -751,7 +855,7 @@ def get_install_commands(self) -> str: "if ! command -v maturin &> /dev/null; then " "pip install --break-system-packages maturin; fi; fi && " "ORIG_DIR=$(pwd) && rm -rf /tmp/dynamo_build && mkdir -p /tmp/dynamo_build && cd /tmp/dynamo_build && " - "git clone https://github.com/ai-dynamo/dynamo.git && " + "dynamo_retry_git_clone dynamo && " "cd dynamo && " f"{checkout_cmd + ' && ' if checkout_cmd else ''}" "cd lib/bindings/python/ && " @@ -767,6 +871,7 @@ def get_install_commands(self) -> str: return ( f"echo 'Installing dynamo from source ({git_ref})...' 
&& " + f"{self._source_install_retry_helpers()}" f"if [ -d /sgl-workspace ]; then {sglang}; else {portable}; fi" ) diff --git a/src/srtctl/core/slurm.py b/src/srtctl/core/slurm.py index 6b5f4e58..7518ae50 100644 --- a/src/srtctl/core/slurm.py +++ b/src/srtctl/core/slurm.py @@ -245,15 +245,16 @@ def start_srun_process( # Build bash command with environment setup bash_parts = [] - # Add preamble if provided - if bash_preamble: - bash_parts.append(bash_preamble) - - # Export environment variables + # Export environment variables before the preamble so setup scripts can + # consume recipe-provided values. if env_to_set: for name, value in env_to_set.items(): bash_parts.append(f"export {name}={shlex.quote(value)}") + # Add preamble if provided + if bash_preamble: + bash_parts.append(bash_preamble) + # Add the main command bash_parts.append(shlex.join(command)) diff --git a/src/srtctl/core/topology.py b/src/srtctl/core/topology.py index f2a24e5d..472e377d 100644 --- a/src/srtctl/core/topology.py +++ b/src/srtctl/core/topology.py @@ -35,8 +35,8 @@ class NodePortAllocator: assignments per node and hands out the next available port. Port ranges (non-overlapping): - - kv_events_port: 5550+ (global) - ZMQ port for kv-events publishing - - nixl_port: 6550+ (global) - NIXL side channel for KV transfers (vLLM) + - kv_events_port: 20000+ (global) - ZMQ port for kv-events publishing + - nixl_port: 21000+ (global) - NIXL side channel for KV transfers (vLLM) - http_port: 30000+ (per node) - HTTP serving port - bootstrap_port: 31000+ (per node) - P/D coordination port (prefill only) @@ -53,8 +53,8 @@ class NodePortAllocator: base_http_port: int = 30000 base_bootstrap_port: int = 31000 - base_kv_events_port: int = 5550 - base_nixl_port: int = 6550 # NIXL side channel ports (must not overlap with kv_events) + base_kv_events_port: int = 20000 + base_nixl_port: int = 21000 # NIXL side channel ports (must not overlap with kv_events) _http_ports: dict[str, int] = field(default_factory=dict, repr=False) _bootstrap_ports: dict[str, int] = field(default_factory=dict, repr=False) diff --git a/src/srtctl/frontends/dynamo.py b/src/srtctl/frontends/dynamo.py index 5e5109a1..48b41ea2 100644 --- a/src/srtctl/frontends/dynamo.py +++ b/src/srtctl/frontends/dynamo.py @@ -83,6 +83,10 @@ def start_frontends( "DYN_REQUEST_PLANE": "nats", } + # Add global recipe environment, including values derived from + # dynamo.wheel, before frontend-specific overrides. + env_to_set.update(runtime.environment) + # Add frontend env from config if config.frontend.env: env_to_set.update(config.frontend.env) diff --git a/src/srtctl/templates/job_script_minimal.j2 b/src/srtctl/templates/job_script_minimal.j2 index 6c0fa9a0..b8c8f50b 100644 --- a/src/srtctl/templates/job_script_minimal.j2 +++ b/src/srtctl/templates/job_script_minimal.j2 @@ -75,18 +75,55 @@ echo "Head node: ${HEAD_NODE}" # Set source directory for container mounts (/configs) export SRTCTL_SOURCE_DIR="${SRTCTL_SOURCE}" +{% for key, value in config_environment.items() %} +export {{ key }}={{ value }} +{% endfor %} + echo "" echo "Preparing srtctl environment..." -# Install uv if not present (single binary, no dependencies) -if ! command -v uv &> /dev/null; then +# SLURM inherits the submitter environment by default. If srtctl was submitted +# from an activated virtualenv, that venv can point at a pipless or wrong-arch +# Python on the compute node. Drop it before selecting Python/uv for the job. 
+if [ -n "${VIRTUAL_ENV:-}" ]; then + echo "Ignoring inherited virtualenv: ${VIRTUAL_ENV}" + CLEAN_PATH="" + OLD_IFS="${IFS}" + IFS=":" + for PATH_ENTRY in ${PATH}; do + if [ "${PATH_ENTRY}" != "${VIRTUAL_ENV}/bin" ]; then + if [ -z "${CLEAN_PATH}" ]; then + CLEAN_PATH="${PATH_ENTRY}" + else + CLEAN_PATH="${CLEAN_PATH}:${PATH_ENTRY}" + fi + fi + done + IFS="${OLD_IFS}" + export PATH="${CLEAN_PATH}" + unset VIRTUAL_ENV +fi + +# Install a job-local uv so inherited submitter binaries cannot shadow the +# compute-node architecture. +UV_BIN_DIR="${OUTPUT_DIR}/uv-bin" +mkdir -p "${UV_BIN_DIR}" +if ! "${UV_BIN_DIR}/uv" --version >/dev/null 2>&1; then echo "Installing uv package manager..." - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH="$HOME/.local/bin:$PATH" + curl -LsSf https://astral.sh/uv/install.sh | env XDG_BIN_HOME="${UV_BIN_DIR}" INSTALLER_NO_MODIFY_PATH=1 sh fi +export PATH="${UV_BIN_DIR}:$PATH" echo "Using uv with Python 3.12..." +if [ -n "${DYNAMO_WHEEL_NAME:-}" ] || [ "${SRTCTL_PREFETCH_AI_DYNAMO:-0}" = "1" ]; then + if [ -f "${SRTCTL_SOURCE}/configs/prefetch-ai-dynamo-wheel.sh" ]; then + bash "${SRTCTL_SOURCE}/configs/prefetch-ai-dynamo-wheel.sh" + else + echo "WARNING: ${SRTCTL_SOURCE}/configs/prefetch-ai-dynamo-wheel.sh not found" + fi +fi + {% if setup_script %} # Custom setup script override from CLI export SRTCTL_SETUP_SCRIPT="{{ setup_script }}" diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py index c15759b2..5a2b2d47 100644 --- a/tests/test_benchmarks.py +++ b/tests/test_benchmarks.py @@ -77,6 +77,36 @@ def test_validate_config_valid(self): errors = runner.validate_config(config) assert errors == [] + def test_build_command_includes_tokenizer_mode(self): + """Passes tokenizer mode through to the SA-Bench script.""" + from unittest.mock import MagicMock + + from srtctl.benchmarks.sa_bench import SABenchRunner + from srtctl.core.schema import BenchmarkConfig, ModelConfig, ResourceConfig, SrtConfig + + runner = SABenchRunner() + runtime = MagicMock() + runtime.frontend_port = 8000 + runtime.is_hf_model = False + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", container="/image", precision="fp4"), + resources=ResourceConfig(gpu_type="h100"), + benchmark=BenchmarkConfig( + type="sa-bench", + isl=1024, + osl=1024, + concurrencies=[4, 8], + tokenizer_mode="deepseek_v4", + use_chat_template=True, + ), + ) + + cmd = runner.build_command(config, runtime) + + assert cmd[-3:] == ["", "true", "deepseek_v4"] + class TestSGLangBenchRunner: """Test SGLang-Bench runner.""" diff --git a/tests/test_configs.py b/tests/test_configs.py index 0b4138d5..1c08d10f 100644 --- a/tests/test_configs.py +++ b/tests/test_configs.py @@ -125,6 +125,14 @@ def test_hash_install_command(self): assert config.needs_source_install cmd = config.get_install_commands() assert "git clone" in cmd + assert "dynamo_retry_git_clone dynamo" in cmd + assert "DYNAMO_INSTALL_RETRIES:-5" in cmd + assert "DYNAMO_INSTALL_RETRY_DELAY:-10" in cmd + assert "DYNAMO_INSTALL_RETRY_MAX_DELAY:-120" in cmd + assert "DYNAMO_INSTALL_RETRY_JITTER:-5" in cmd + assert "RANDOM % (jitter + 1)" in cmd + assert 'rm -rf "$target" "$tmp_target"' in cmd + assert "else rc=$?; fi" in cmd assert "git checkout abc123" in cmd assert "maturin build" in cmd assert "if [ -d /sgl-workspace ]" in cmd @@ -133,6 +141,60 @@ def test_hash_install_command(self): assert "if ! command -v cargo" in cmd assert "if ! 
command -v maturin" in cmd + def test_wheel_install_command(self): + """Wheel config installs ai-dynamo plus runtime without source build.""" + from srtctl.core.schema import DynamoConfig + + config = DynamoConfig(wheel="1.2.0.dev20260426") + cmd = config.get_install_commands() + + assert config.version is None + assert config.needs_source_install is False + assert "install-ai-dynamo.sh" in cmd + assert "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl" in cmd + assert "--no-deps" in cmd + assert "ai-dynamo-runtime==1.2.0.dev20260426" in cmd + assert "--find-links /configs/wheels" in cmd + assert "--find-links /configs" in cmd + assert "--extra-index-url" not in cmd + assert "were not found" in cmd + assert "maturin" not in cmd + assert "git clone" not in cmd + + def test_source_install_clone_retry_helper_retries_and_cleans_partial_clone(self, tmp_path): + """Clone helper retries transient failures and cleans partial clone directories.""" + import subprocess + + from srtctl.core.schema import DynamoConfig + + script = f""" +set -euo pipefail +{DynamoConfig._source_install_retry_helpers()} +git() {{ + count=0 + if [ -f attempts ]; then count=$(cat attempts); fi + count=$((count + 1)) + echo "$count" > attempts + mkdir -p "$3" + echo "attempt-$count" > "$3/marker" + if [ "$count" -lt 3 ]; then + return 22 + fi + return 0 +}} +export DYNAMO_INSTALL_RETRIES=4 +export DYNAMO_INSTALL_RETRY_DELAY=0 +export DYNAMO_INSTALL_RETRY_JITTER=0 +dynamo_retry_git_clone dynamo +test "$(cat attempts)" = "3" +test "$(cat dynamo/marker)" = "attempt-3" +if find . -maxdepth 1 -type d -name 'dynamo.clone.*' | grep -q .; then + echo "leftover temp clone" >&2 + exit 1 +fi +""" + subprocess.run(["bash", "-c", script], cwd=tmp_path, check=True, capture_output=True, text=True) + def test_top_of_tree_install_command(self): """Top-of-tree config generates source install without checkout.""" from srtctl.core.schema import DynamoConfig @@ -156,6 +218,40 @@ def test_hash_and_top_of_tree_not_allowed(self): with pytest.raises(ValueError, match="Cannot specify both"): DynamoConfig(hash="abc123", top_of_tree=True) + def test_hash_and_wheel_not_allowed(self): + """Cannot specify both hash and wheel.""" + from srtctl.core.schema import DynamoConfig + + with pytest.raises(ValueError, match="Cannot specify both"): + DynamoConfig(hash="abc123", wheel="1.2.0.dev20260426") + + def test_wheel_filename_not_allowed(self): + """Wheel config takes a package version, not an artifact filename.""" + from srtctl.core.schema import DynamoConfig + + with pytest.raises(ValueError, match="package version"): + DynamoConfig(wheel="ai_dynamo-1.2.0.dev20260426-py3-none-any.whl") + + def test_wheel_version_required(self): + """Wheel config must provide an exact package version.""" + from srtctl.core.schema import DynamoConfig + + with pytest.raises(ValueError, match="non-empty package version"): + DynamoConfig(wheel="") + + def test_wheel_environment_from_version(self): + """Wheel version is converted to setup/prefetch environment.""" + from srtctl.core.schema import DynamoConfig + + config = DynamoConfig(wheel="1.2.0.dev20260426") + + assert config.wheel_version == "1.2.0.dev20260426" + assert config.wheel_name == "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl" + assert config.get_wheel_environment() == { + "DYNAMO_VERSION": "1.2.0.dev20260426", + "DYNAMO_WHEEL_NAME": "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl", + } + class TestSGLangProtocol: """Tests for SGLangProtocol.""" @@ -479,6 +575,32 @@ def 
test_sbatch_template_includes_setup_script_env_var(self): ) assert 'export SRTCTL_SETUP_SCRIPT="install-sglang-main.sh"' in script + def test_sbatch_template_prefetches_dynamo_wheel(self): + """Test that dynamo.wheel is exported and prefetched before orchestrator launch.""" + from pathlib import Path + + from srtctl.cli.submit import generate_minimal_sbatch_script + from srtctl.core.schema import DynamoConfig, ModelConfig, ResourceConfig, SrtConfig + + config = SrtConfig( + name="test", + model=ModelConfig(path="/model", container="/container.sqsh", precision="fp8"), + resources=ResourceConfig(gpu_type="h100", gpus_per_node=8, agg_nodes=1), + dynamo=DynamoConfig( + install=True, + wheel="1.2.0.dev20260426", + ), + ) + + script = generate_minimal_sbatch_script(config, Path("/tmp/test.yaml")) + + assert "export DYNAMO_VERSION=1.2.0.dev20260426" in script + assert "export DYNAMO_WHEEL_NAME=ai_dynamo-1.2.0.dev20260426-py3-none-any.whl" in script + assert "Ignoring inherited virtualenv" in script + assert 'unset VIRTUAL_ENV' in script + assert 'UV_BIN_DIR="${OUTPUT_DIR}/uv-bin"' in script + assert "configs/prefetch-ai-dynamo-wheel.sh" in script + def test_setup_script_env_var_override(self, monkeypatch): """Test that SRTCTL_SETUP_SCRIPT env var overrides config.""" import os @@ -1323,8 +1445,6 @@ def test_connector_lmcache_generates_kv_transfer_config(self): def test_connector_custom_json_passthrough(self): """connector set to a raw JSON string is passed through as-is.""" - import json - custom = '{"kv_connector":"MyCustomConnector","kv_role":"kv_both"}' cmd = self._build_cmd_with_connector(custom) idx = cmd.index("--kv-transfer-config")
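
The staged-wheel install path added in schema.py is what the recipes above opt into with dynamo: install: true, wheel: "1.2.0.dev20260426". A minimal usage sketch of the config surface, mirroring the assertions in test_configs.py; nothing here goes beyond what the hunks define:

    from srtctl.core.schema import DynamoConfig

    cfg = DynamoConfig(wheel="1.2.0.dev20260426")

    assert cfg.version is None                 # wheel clears the PyPI default
    assert cfg.needs_source_install is False   # no git clone / maturin build
    assert cfg.wheel_name == "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl"

    # Exported into the sbatch script (and shown in the submit env table)
    # so the prefetch/setup scripts can locate the staged wheels:
    assert cfg.get_wheel_environment() == {
        "DYNAMO_WHEEL_NAME": "ai_dynamo-1.2.0.dev20260426-py3-none-any.whl",
        "DYNAMO_VERSION": "1.2.0.dev20260426",
    }

At job runtime, get_install_commands prefers /configs/install-ai-dynamo.sh when present, then falls back to pip-installing the exact ai-dynamo wheel plus ai-dynamo-runtime from /configs/wheels, then from /configs, and exits with an error if no staged wheel matches the pinned version.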
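
dynamo_retry_git_clone retries transient clone failures with capped exponential backoff plus bounded jitter, and clones each attempt into a unique temp directory that is moved into place only on success, so a partial clone never masquerades as a good checkout. A back-of-envelope model of the sleep schedule under the helper's defaults (DYNAMO_INSTALL_RETRIES=5, 10 s initial delay doubling to a 120 s cap, 0-5 s jitter); the Python below is illustrative only, the authoritative logic is the inline bash above:

    import random

    def sleep_schedule(attempts=5, delay=10, max_delay=120, jitter=5):
        """Sleeps occur between failed attempts; none after the last one."""
        sleeps = []
        for _ in range(attempts - 1):
            # Mirrors the bash: sleep_for = delay + RANDOM % (jitter + 1)
            sleeps.append(delay + random.randint(0, jitter))
            delay = min(delay * 2, max_delay)
        return sleeps

    # Four possible retries under defaults: roughly [10-15, 20-25, 40-45, 80-85] s;
    # the 120 s cap only binds if DYNAMO_INSTALL_RETRIES is raised.
    print(sleep_schedule())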
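
The tokenizer-mode plumbing crosses three layers: BenchmarkConfig.tokenizer_mode is appended by sa_bench.build_command as the script's 17th positional argument, bench.sh reads it as ${17:-auto} and forwards it as --tokenizer-mode, and benchmark_serving.py accepts the new deepseek_v4 choice. A sketch of just the handoff, using only values visible in the hunks:

    tokenizer_mode = "deepseek_v4"         # BenchmarkConfig.tokenizer_mode
    use_chat_template = True
    custom_tokenizer = None

    # sa_bench.build_command ends the command with three positionals:
    cmd_tail = [
        custom_tokenizer or "",            # ${15} CUSTOM_TOKENIZER
        str(use_chat_template).lower(),    # ${16} USE_CHAT_TEMPLATE
        tokenizer_mode or "auto",          # ${17} TOKENIZER_MODE (new)
    ]
    assert cmd_tail == ["", "true", "deepseek_v4"]

    # bench.sh then passes it unconditionally on every run:
    #   TOKENIZER_MODE=${17:-auto}
    #   TOKENIZER_MODE_ARGS=(--tokenizer-mode "$TOKENIZER_MODE")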
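
The NodePortAllocator rebase moves the two global ranges (kv-events ZMQ and the NIXL side channel) from 5550/6550 up to 20000/21000 while leaving the per-node HTTP and bootstrap ranges at 30000/31000, so the four port classes stay non-overlapping. Assuming the next-available-port handout the docstring describes (the allocator's accessor methods are not shown in this hunk), the i-th global consumer lands at:

    # Base values come from the diff; the sequential "+ i" handout is inferred
    # from the docstring's "hands out the next available port".
    BASE_KV_EVENTS = 20000   # global: ZMQ kv-events publishing
    BASE_NIXL = 21000        # global: NIXL side channel for KV transfers (vLLM)
    BASE_HTTP = 30000        # per node: HTTP serving
    BASE_BOOTSTRAP = 31000   # per node: P/D coordination (prefill only)

    for i in range(3):
        print(f"global consumer {i}: kv_events={BASE_KV_EVENTS + i}, "
              f"nixl={BASE_NIXL + i}")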
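
The slurm.py reorder is behavioral, not cosmetic: recipe environment (including DYNAMO_VERSION and DYNAMO_WHEEL_NAME derived from dynamo.wheel) is now exported before the bash preamble runs, so setup scripts launched from the preamble can consume those values. A sketch of how the parts line up, with a placeholder preamble and command (how slurm.py joins bash_parts is not shown in this hunk):

    import shlex

    env_to_set = {"DYNAMO_VERSION": "1.2.0.dev20260426"}
    bash_preamble = "bash /configs/install-ai-dynamo.sh"   # illustrative preamble
    command = ["echo", "worker ready"]                     # illustrative command

    bash_parts = []
    for name, value in env_to_set.items():                 # exports come first now
        bash_parts.append(f"export {name}={shlex.quote(value)}")
    if bash_preamble:
        bash_parts.append(bash_preamble)                   # preamble sees the exports
    bash_parts.append(shlex.join(command))

    print("\n".join(bash_parts))  # the actual joiner in slurm.py is an assumption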