53 changes: 53 additions & 0 deletions configs/install-nixl-dsv4.sh
@@ -0,0 +1,53 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Install SGLang with the DeepSeek-V4 NIXL state-buffer transport fix.
# Remove this once https://github.com/sgl-project/sglang/pull/23773 is merged upstream.

set -euo pipefail

SGLANG_DIR="${SGLANG_DIR:-/sgl-workspace/sglang}"
SGLANG_REMOTE="${SGLANG_REMOTE:-https://github.com/sgl-project/sglang.git}"
SGLANG_PR_NUMBER="${SGLANG_PR_NUMBER:-23773}"
SGLANG_PR_REF="refs/pull/${SGLANG_PR_NUMBER}/head"
SGLANG_LOCAL_BRANCH="${SGLANG_LOCAL_BRANCH:-nixl-dsv4-pr-${SGLANG_PR_NUMBER}}"

echo "=== Installing SGLang NIXL DSV4 fix from PR #${SGLANG_PR_NUMBER} ==="

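# Serialize concurrent runs behind an advisory lock; fd 9 stays held for the
# rest of the script, so parallel tasks on one node don't race the checkout.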
if command -v flock >/dev/null 2>&1; then
  mkdir -p /tmp/srt-slurm-locks
  exec 9>/tmp/srt-slurm-locks/install-nixl-dsv4.lock
  flock 9
fi

mkdir -p "$(dirname "$SGLANG_DIR")"

if [ ! -d "$SGLANG_DIR/.git" ]; then
  echo "Recreating $SGLANG_DIR from $SGLANG_REMOTE"
  rm -rf "$SGLANG_DIR"
  git clone --depth 1 "$SGLANG_REMOTE" "$SGLANG_DIR"
fi

cd "$SGLANG_DIR"

git config --global --add safe.directory "$SGLANG_DIR" || true

if git remote get-url origin >/dev/null 2>&1; then
  git remote set-url origin "$SGLANG_REMOTE"
else
  git remote add origin "$SGLANG_REMOTE"
fi

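# Fetch just the PR head ref and pin a local branch to it; checkout -f -B keeps reruns idempotent.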
git fetch --depth 1 origin "$SGLANG_PR_REF"
git checkout -f -B "$SGLANG_LOCAL_BRANCH" FETCH_HEAD

INSTALLED_COMMIT="$(git rev-parse HEAD)"
echo "Checked out SGLang PR #${SGLANG_PR_NUMBER} at ${INSTALLED_COMMIT}"

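# Sanity check: the PR adds state-buffer transport (send_state / state_data_ptrs)
# to the NIXL connector; fail loudly if the checkout doesn't contain it.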
NIXL_CONN="python/sglang/srt/disaggregation/nixl/conn.py"
if ! grep -q "send_state" "$NIXL_CONN" || ! grep -q "state_data_ptrs" "$NIXL_CONN"; then
  echo "ERROR: expected NIXL state-buffer transport changes were not found in $NIXL_CONN" >&2
  exit 1
fi

echo "=== SGLang NIXL DSV4 fix installed ==="
111 changes: 111 additions & 0 deletions recipes/dsv4-pro/README.md
@@ -0,0 +1,111 @@
# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k on GB300

This directory contains NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro**
(1.6T-parameter MoE with MXFP4 MoE weights + FP8 KV, UE8M0 scales) on **GB300**
(ARM64 Grace + Blackwell, 4 GPUs per node) under a 1024-input / 1024-output
workload. Both **aggregated** (SGLang on one or two nodes) and
**disaggregated** (dynamo frontend + NIXL KV transfer) serving modes are
covered.

## Container

All recipes reference the `dsv4-grace-blackwell` alias defined in
`srtslurm.yaml.example`. Pull + convert:

```bash
enroot import --output sglang-deepseek-v4-grace-blackwell.sqsh \
docker://lmsysorg/sglang:deepseek-v4-grace-blackwell
```

(Use the `deepseek-v4-blackwell` image for B200 x86_64, or `deepseek-v4-hopper` for H200.)

## Model checkpoint

```bash
hf download deepseek-ai/DeepSeek-V4-Pro --local-dir /shared/models/deepseek/DeepSeek-V4-Pro
```

## Recipes

### Aggregated (single SGLang server)

| file | parallelism | MTP | target | notes |
|---|---|---|---|---|
| `agg-low-latency.yaml` | TP=4 | EAGLE 3/4 | minimum TPOT / best per-user latency | GB300 1 node |
| `agg-nomtp.yaml` | TP=4 | — | baseline throughput, no spec decoding | GB300 1 node |
| `agg-balanced-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | EAGLE 1/2 | Pareto mid-curve | GB300 1 node |
| `agg-max-tpt-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | — | maximum TPS/GPU | GB300 1 node |
| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2× memory headroom | GB300 2 nodes |
| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2× memory headroom | GB300 2 nodes |

### Disaggregated (dynamo frontend, NIXL KV transfer)

> ⚠️ **Required SGLang patch (upstreaming in flight).** All disagg
> recipes below depend on a fix to `python/sglang/srt/disaggregation/nixl/conn.py`
> that registers and transfers the model's auxiliary state buffers
> (SWA / NSA / Mamba) alongside the KV cache. Without the patch the NIXL
> backend silently drops the state buffers, and decode-side accuracy
> collapses on DSv4-Pro (GSM8K ≈ 0.13 vs 1.00 with the patch) even
> though throughput numbers look healthy. The fix mirrors what the
> Mooncake backend already does; an upstream sglang PR is in flight.
> Until it lands, point your `dsv4-grace-blackwell` container at a build
> with the patch applied (mounting the patched
> `python/sglang/srt/disaggregation/nixl/` over the container path is
> sufficient; `configs/install-nixl-dsv4.sh` produces such a checkout).
> The recipes themselves intentionally do **not** declare any local
> mounts; pick up the patch via your container build process.
>
> The performance numbers in the table further down were measured against
> a patched build; they should reproduce on any build that includes the
> equivalent fix.
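
For reference, one way to apply that overlay with a pyxis/enroot launcher is to
bind-mount a patched host checkout over the container's copy. Paths below are
illustrative; `/sgl-workspace/sglang` matches the default used by
`configs/install-nixl-dsv4.sh`:

```bash
# Host checkout with the patch applied (e.g. produced by configs/install-nixl-dsv4.sh).
PATCHED=/shared/src/sglang
srun --container-image sglang-deepseek-v4-grace-blackwell.sqsh \
  --container-mounts "${PATCHED}/python/sglang/srt/disaggregation/nixl:/sgl-workspace/sglang/python/sglang/srt/disaggregation/nixl" \
  your-worker-launch-command
```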

`XPYD` in the table below denotes **X prefill nodes + Y decode nodes**
(one SGLang worker per role per node, not per-instance counts). Each
GB300 node has 4 GPUs, so, e.g., 2P+2D at DEP=8 uses 16 GPUs in total.

| file | topology | parallelism | MoE backend | target | notes |
|---|---|---|---|---|---|
| `disagg-1p1d-tp4-mxfp4.yaml` | 1P+1D (2 nodes / 8 GPU) | both TP=4 | flashinfer_mxfp4 | low-latency, low/medium concurrency | TP-only baseline |
| `disagg-1p1d-dep4-mega-moe.yaml` | 1P+1D (2 nodes / 8 GPU) | both TP=4 + DP=4 + DeepEP | mega_moe (DeepGEMM) | DEP throughput Pareto reference | TEP topology, mirrors `agg-max-tpt-tep.yaml` split across 2 nodes |
| `disagg-1p2d-dep4-to-dep8-mega-moe.yaml`| 1P+2D (3 nodes / 12 GPU) | P: TP=4+DP=4; D: TP=8+DP=8 + DeepEP | mega_moe (DeepGEMM) | **best per-GPU efficiency** for decode-heavy 1k/1k | asymmetric — decode EP domain doubled |
| `disagg-2p2d-dep8-mega-moe.yaml` | 2P+2D (4 nodes / 16 GPU) | both TP=8 + DP=8 + DeepEP | mega_moe (DeepGEMM) | largest DEP throughput config | symmetric counterpart to the 1P2D recipe |
| `disagg-2p2d-tp8-mxfp4.yaml` | 2P+2D (4 nodes / 16 GPU) | both TP=8 | flashinfer_mxfp4 | TP-only 4-node baseline | quantifies the DEP+DeepEP uplift on GB300 |

Multi-node decode recipes intentionally do NOT set
`SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2`: CAR_V2 is single-node only and
silently corrupts results when used across nodes.
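
A cheap guard when authoring or reviewing recipes; the glob below is
illustrative and matches the multi-node-decode recipes in the table above:

```bash
# CAR_V2 is single-node only: it must not appear in recipes whose TP domain spans nodes.
if grep -l 'SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2' recipes/dsv4-pro/disagg-*p2d-*.yaml 2>/dev/null; then
  echo 'ERROR: CAR_V2 set in a multi-node recipe' >&2
  exit 1
fi
```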

#### Verified throughput (sa-bench, isl=osl=1024, random_range_ratio=0.8)

Peak Total TPS / GPU at the saturation point of each curve (lower-conc
points trade throughput for latency; full Pareto curves available on
request):

| recipe | GPUs | peak conc | Output TPS | Total TPS / GPU | Mean TTFT | Mean TPOT |
|---|---:|---:|---:|---:|---:|---:|
| `disagg-1p1d-tp4-mxfp4.yaml` | 8 | 128 | 3,349 | 838 | 1.05 s | 36.1 ms |
| `disagg-1p1d-dep4-mega-moe.yaml` | 8 | 128 | 3,293 | 824 | 0.88 s | 36.8 ms |
| `disagg-2p2d-tp8-mxfp4.yaml` | 16 | 512 | 6,863 | 857 | 2.26 s | 70.2 ms |
| `disagg-2p2d-dep8-mega-moe.yaml` | 16 | 2,048 | 32,840 | 4,104 | 2.12 s | 58.2 ms |
| `disagg-1p2d-dep4-to-dep8-mega-moe.yaml` | 12 | 2,048 | 33,442 | **5,572** | 4.26 s | 53.8 ms |

Headline: the asymmetric 1P2D DEP4→DEP8 config delivers the highest
**per-GPU** total throughput. At 1k/1k the workload is decode-heavy, so
doubling the decode EP domain (4 → 8 GPUs, i.e. 64 → 32 of the model's
256 experts per GPU) buys far more than scaling prefill.
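
The Total TPS / GPU column follows from Output TPS: with isl = osl, total
tokens are roughly twice the output tokens (only roughly, because
`random_range_ratio: 0.8` varies the actual lengths). A spot-check of the
1P2D row:

```bash
# Total TPS / GPU ≈ 2 * output TPS / GPU count for the isl == osl workload.
output_tps=33442; gpus=12
echo $(( 2 * output_tps / gpus ))   # 5573 vs the reported 5,572 (measured lengths differ slightly)
```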

## Key flags (derived from the SGLang DSv4 cookbook)

- `moe-runner-backend: flashinfer_mxfp4` — MXFP4 MoE kernels (Blackwell only).
- `chunked-prefill-size: 4096` + `disable-flashinfer-autotune: true` — cookbook recipe.
- `disable-radix-cache: true` — synthetic benchmark best practice; also
reduces contiguous-allocator fragmentation at weight-reorder time.
- `mem-fraction-static: 0.78` — leaves headroom for the MXFP4
`reorder_w1w3_to_w3w1` path (0.82 intermittently OOMs on GB300).
- TEP recipes: `enable-dp-attention` + `moe-a2a-backend: deepep`, with
  `deepep-config` setting `num_sms: 96` for both dispatch and combine (the
  DeepEP `DEEPEP_LARGE_SMS_FLAG` setting for single-node Blackwell, per the
  cookbook). A rough CLI equivalent of these flags is sketched below.
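
Under the srt-slurm convention that `sglang_config` keys map one-to-one onto
`sglang.launch_server` flags (kebab-case key → `--flag`), the flags above
correspond to roughly the following. This is a sketch under that mapping
assumption, not a verified command line:

```bash
python -m sglang.launch_server \
  --model-path /model/ \
  --tensor-parallel-size 4 \
  --moe-runner-backend flashinfer_mxfp4 \
  --chunked-prefill-size 4096 \
  --disable-flashinfer-autotune \
  --disable-radix-cache \
  --mem-fraction-static 0.78

# TEP variants additionally pass:
#   --enable-dp-attention --moe-a2a-backend deepep \
#   --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
```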

## References

- [SGLang cookbook: `docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx`](https://github.com/sgl-project/sglang/blob/main/docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx)
- [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro)
- Upstream SGLang PR: sgl-project/sglang#23600
2 changes: 1 addition & 1 deletion recipes/dsv4-pro/agg-2n-nomtp.yaml
@@ -1,4 +1,4 @@
- # DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - MTP enabled
+ # DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8)
name: "dsv4-pro-gb300-2n-agg-nomtp-1k1k-official"

slurm:
126 changes: 126 additions & 0 deletions recipes/dsv4-pro/disagg-1p1d-dep4-mega-moe.yaml
@@ -0,0 +1,126 @@
# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, DEP=4, mega_moe) — dynamo frontend.
#
# 1 prefill node + 1 decode node, each TP=4 + DP=4 + DP-attention + DeepEP
# (the same TEP topology as the agg-balanced-tep / agg-max-tpt-tep recipes
# in this directory, but split across two nodes). Frontend is dynamo with
# the nginx fan-in container; KV transfer over NIXL.
#
# Throughput-oriented disagg counterpart to disagg-1p1d-tp4-mxfp4.yaml:
# DEP4 + mega_moe (DeepEP-based MoE all-to-all) gives roughly the same
# peak throughput at conc=128 as the pure-TP4 disagg recipe, but holds
# that throughput further into the saturation regime because DeepEP keeps
# MoE compute well-balanced.
name: "dsv4-pro-gb300-disagg-1p1d-dep4-mega-moe-1k1k"

slurm:
  partition: gb300
  time_limit: "4:00:00"

frontend:
  type: dynamo
  nginx_container: nginx

model:
  path: "dsv4-pro"
  container: "dsv4-grace-blackwell"
  precision: "mxfp4"

resources:
  gpu_type: "gb300"
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_node: 4

backend:
  type: sglang

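# Both roles run the DeepGEMM mega_moe stack; the SGLANG_OPT_* toggles below
# are identical for prefill and decode.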
prefill_environment:
  PYTHONUNBUFFERED: "1"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_CUMEM_ENABLE: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
  SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
  SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
  SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
  SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
  SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
  SGLANG_OPT_USE_FAST_MASK_EP: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

decode_environment:
  PYTHONUNBUFFERED: "1"
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
  NCCL_MNNVL_ENABLE: "1"
  NCCL_CUMEM_ENABLE: "1"
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
  SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
  SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
  SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
  SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
  SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
  SGLANG_OPT_USE_FAST_MASK_EP: "1"
  SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
  SGLANG_OPT_USE_JIT_NORM: "1"
  SGLANG_OPT_USE_TOPK_V2: "1"
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true

    # DEP4: TP=4 + DP=4 + DP-attention + DeepEP (mega_moe path)
    tensor-parallel-size: 4
    data-parallel-size: 4
    enable-dp-attention: true
    moe-a2a-backend: "deepep"
    deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: nixl

    mem-fraction-static: 0.90
    max-running-requests: 1024
    cuda-graph-max-bs: 1024
    chunked-prefill-size: 32768
    disable-radix-cache: true

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true

    tensor-parallel-size: 4
    data-parallel-size: 4
    enable-dp-attention: true
    moe-a2a-backend: "deepep"
    deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

    disaggregation-mode: "decode"
    disaggregation-transfer-backend: nixl

    mem-fraction-static: 0.90
    max-running-requests: 1024
    cuda-graph-max-bs: 1024
    chunked-prefill-size: 32768
    disable-radix-cache: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  concurrencies: "4x8x16x32x64x128x256x512x1024x1536x2048"
  req_rate: "inf"
  use_chat_template: false
82 changes: 82 additions & 0 deletions recipes/dsv4-pro/disagg-1p1d-tp4-mxfp4.yaml
@@ -0,0 +1,82 @@
# DeepSeek-V4-Pro disaggregated on GB300 (1P1D, TP=4, MXFP4) — dynamo frontend.
#
# 1 prefill node + 1 decode node, each TP=4 on a single GB300 (4 GPUs).
# Frontend is dynamo with an nginx fan-in container; KV transfer over NIXL.
# Companion to the agg-* recipes in this directory: same model + workload, same
# MXFP4 MoE kernels, but split prefill / decode across two nodes for steady
# decode TPOT under high concurrency.
name: "dsv4-pro-gb300-disagg-1p1d-tp4-mxfp4-1k1k"

slurm:
  partition: gb300
  time_limit: "4:00:00"

frontend:
  type: dynamo
  # `nginx` resolves to nginx:latest from Docker Hub via enroot — no local alias
  # required. dynamo uses it as the fan-in / health-check entrypoint.
  nginx_container: nginx

model:
  path: "dsv4-pro"
  container: "dsv4-grace-blackwell"
  precision: "mxfp4"

resources:
  gpu_type: "gb300"
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_node: 4

backend:
  type: sglang

prefill_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

decode_environment:
  SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"

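# prefill and decode below differ only in disaggregation-mode; all other knobs match.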
sglang_config:
  prefill:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "prefill"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

  decode:
    served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
    model-path: "/model/"
    trust-remote-code: true
    tensor-parallel-size: 4
    disaggregation-mode: "decode"
    disaggregation-transfer-backend: nixl
    moe-runner-backend: "flashinfer_mxfp4"
    disable-flashinfer-autotune: true
    mem-fraction-static: 0.90
    max-running-requests: 128
    cuda-graph-max-bs: 128
    chunked-prefill-size: 8192
    disable-radix-cache: true

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  random_range_ratio: 0.8
  # Low-latency band only — TP4 1P1D saturates near conc=128 on GB300;
  # for high-concurrency Pareto use the DEP variants.
  concurrencies: "4x8x16x32x64x128"
  req_rate: "inf"
  use_chat_template: false