**`recipes/gb200-fp4/1k1k-dsv4/README.md`** (new file, +21 lines)
# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k aggregated on GB200

NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro** (MXFP4) on **GB200**
(ARM64 Grace + Blackwell, 4 GPUs per node), aggregated mode, 1k / 1k workload.
Per-GPU HBM on GB200 is smaller than on GB300, so the 1.6T MXFP4 checkpoint
only fits across **2 nodes (8 GPUs) at TP=8**.

## Container

Same aarch64 Grace+Blackwell image as the GB300 recipes (the shared enroot
`.sqsh` alias `dsv4-grace-blackwell` in `srtslurm.yaml.example`).

## Recipes

| file | parallelism | MTP | notes |
|---|---|---|---|
| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2-node |
| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2-node |

See `recipes/gb300-fp4/1k1k-dsv4/README.md` for the full flag rationale —
flags are identical to the GB300 2-node recipes apart from the partition.
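
The two-node requirement can be sanity-checked with back-of-the-envelope arithmetic. The HBM capacities (roughly 192 GB per GB200 GPU, 288 GB per GB300 GPU) and the 75% weight budget below are illustrative assumptions, not values taken from these recipes:

```python
import math

def min_gpus_for_weights(params, bits_per_param, hbm_per_gpu_gb, weight_budget=0.75):
    """Smallest power-of-two GPU count whose combined HBM holds the weights,
    reserving (1 - weight_budget) of HBM for KV cache, activations, CUDA graphs."""
    weights_gb = params * bits_per_param / 8 / 1e9
    n = math.ceil(weights_gb / (hbm_per_gpu_gb * weight_budget))
    pow2 = 1
    while pow2 < n:  # TP sizes are powers of two
        pow2 *= 2
    return pow2

# 1.6T params at MXFP4: 4-bit values + one shared 8-bit scale per 32-element block
bits = 4 + 8 / 32  # ~4.25 bits/param, i.e. ~850 GB of weights
print(min_gpus_for_weights(1.6e12, bits, 192))  # GB200 → 8 (2 nodes × 4 GPUs)
print(min_gpus_for_weights(1.6e12, bits, 288))  # GB300 → 4 (1 node)
```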
**`recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml`** (new file, +61 lines)
# DeepSeek-V4-Pro aggregated on GB200 2 nodes (TP=8) - MTP enabled
name: "dsv4-pro-gb200-2n-agg-ll-1k1k-official"

slurm:
partition: gb200
time_limit: "4:00:00"

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "fp4"

frontend:
type: sglang

resources:
gpu_type: "gb200"
gpus_per_node: 4
agg_nodes: 2
agg_workers: 1

backend:
type: sglang

aggregated_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"

sglang_config:
aggregated:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tensor-parallel-size: 8

moe-runner-backend: "flashinfer_mxfp4"
speculative-algo: "EAGLE"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
mem-fraction-static: 0.82

context-length: 2200
cuda-graph-max-bs: 1024
max-running-requests: 1024
disable-radix-cache: true
decode-log-interval: 1
stream-interval: 50

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
req_rate: "inf"
**`recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml`** (new file, +57 lines)
# DeepSeek-V4-Pro aggregated on GB200 2 nodes (TP=8) - MTP disabled
name: "dsv4-pro-gb200-2n-agg-nomtp-1k1k-official"

slurm:
partition: gb200
time_limit: "4:00:00"

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "fp4"

frontend:
type: sglang

resources:
gpu_type: "gb200"
gpus_per_node: 4
agg_nodes: 2
agg_workers: 1

backend:
type: sglang

aggregated_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"

sglang_config:
aggregated:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tensor-parallel-size: 8

moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
mem-fraction-static: 0.82

context-length: 2200
cuda-graph-max-bs: 1024
max-running-requests: 1024
disable-radix-cache: true
decode-log-interval: 1
stream-interval: 50

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
req_rate: "inf"
**`recipes/gb300-fp4/1k1k-dsv4/README.md`** (new file, +53 lines)
# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k aggregated on GB300

This directory contains NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro**
(1.6T-parameter MoE with MXFP4 MoE weights + FP8 KV, UE8M0 scales) on **GB300**
(ARM64 Grace + Blackwell, 4 GPUs per node), aggregated serving mode, 1024 input /
1024 output workload.

## Container

All recipes reference the `dsv4-grace-blackwell` alias defined in
`srtslurm.yaml.example`. Pull the Docker image and convert it to an enroot squashfs:

```bash
enroot import --output sglang-deepseek-v4-grace-blackwell.sqsh \
docker://lmsysorg/sglang:deepseek-v4-grace-blackwell
```

(Use the `deepseek-v4-blackwell` image for B200 x86_64, or `deepseek-v4-hopper` for H200.)

## Model checkpoint

```bash
hf download deepseek-ai/DeepSeek-V4-Pro --local-dir /shared/models/deepseek/DeepSeek-V4-Pro
```

## Recipes

| file | parallelism | MTP | target | notes |
|---|---|---|---|---|
| `agg-low-latency.yaml` | TP=4 | EAGLE 3/4 | minimum TPOT / best per-user latency | GB300 1 node |
| `agg-nomtp.yaml` | TP=4 | — | baseline throughput, no spec decoding | GB300 1 node |
| `agg-balanced-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | EAGLE 1/2 | Pareto mid-curve | GB300 1 node |
| `agg-max-tpt-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | — | maximum TPS/GPU | GB300 1 node |
| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2× memory headroom | GB300 2 nodes |
| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2× memory headroom | GB300 2 nodes |

## Key flags (derived from the SGLang DSv4 cookbook)

- `moe-runner-backend: flashinfer_mxfp4` — MXFP4 MoE kernels (Blackwell only).
- `chunked-prefill-size: 4096` + `disable-flashinfer-autotune: true` — cookbook recipe.
- `disable-radix-cache: true` — synthetic benchmark best practice; also
reduces contiguous-allocator fragmentation at weight-reorder time.
- `mem-fraction-static: 0.78` — leaves headroom for the MXFP4
`reorder_w1w3_to_w3w1` path (0.82 intermittently OOMs on GB300).
- TEP recipes: `enable-dp-attention: true` with `moe-a2a-backend: deepep`, plus a
  `deepep-config` setting `num_sms=96` (the DeepEP `DEEPEP_LARGE_SMS_FLAG`
  configuration for single-node Blackwell, per the cookbook).
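
For readers who prefer a command line, the flat key/value blocks in these recipes map onto `--flag value` arguments in the usual way (booleans become bare flags). A minimal sketch of that mapping, using an abbreviated TEP config as input; the exact launch invocation depends on the runner, so treat this as illustrative:

```python
def yaml_to_cli(cfg):
    """Render a flat recipe mapping as CLI arguments:
    True -> bare --flag, other values -> --key value."""
    args = []
    for key, val in cfg.items():
        if isinstance(val, bool):
            if val:
                args.append(f"--{key}")
        else:
            args += [f"--{key}", str(val)]
    return args

tep = {
    "tensor-parallel-size": 4,
    "data-parallel-size": 4,
    "enable-dp-attention": True,
    "moe-a2a-backend": "deepep",
}
print(" ".join(yaml_to_cli(tep)))
# → --tensor-parallel-size 4 --data-parallel-size 4 --enable-dp-attention --moe-a2a-backend deepep
```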

## References

- [SGLang cookbook: `docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx`](https://github.com/sgl-project/sglang/blob/main/docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx)
- [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro)
- Upstream SGLang PR: sgl-project/sglang#23600
**`recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml`** (new file, +62 lines)
# DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - MTP enabled
name: "dsv4-pro-gb300-2n-agg-ll-1k1k-official"

slurm:
partition: gb300
time_limit: "4:00:00"

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "fp4"

frontend:
type: sglang

resources:
gpu_type: "gb300"
gpus_per_node: 4
agg_nodes: 2
agg_workers: 1

backend:
type: sglang

aggregated_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"

sglang_config:
aggregated:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tensor-parallel-size: 8

moe-runner-backend: "flashinfer_mxfp4"
speculative-algo: "EAGLE"
speculative-num-steps: 3
speculative-eagle-topk: 1
speculative-num-draft-tokens: 4
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
mem-fraction-static: 0.82

context-length: 2200
cuda-graph-max-bs: 1024
max-running-requests: 1024
disable-radix-cache: true
decode-log-interval: 1
stream-interval: 50

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
req_rate: "inf"
**`recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml`** (new file, +58 lines)
# DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - MTP disabled
name: "dsv4-pro-gb300-2n-agg-nomtp-1k1k-official"

slurm:
partition: gb300
time_limit: "4:00:00"

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "fp4"

frontend:
type: sglang

resources:
gpu_type: "gb300"
gpus_per_node: 4
agg_nodes: 2
agg_workers: 1

backend:
type: sglang

aggregated_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"

sglang_config:
aggregated:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true
tensor-parallel-size: 8

moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
mem-fraction-static: 0.82

context-length: 2200
cuda-graph-max-bs: 1024
max-running-requests: 1024
disable-radix-cache: true
decode-log-interval: 1
stream-interval: 50

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
req_rate: "inf"
**`recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml`** (new file, +72 lines)
# DeepSeek-V4-Pro GB300 1-node TEP 'balanced' recipe
# From the SGLang cookbook (DeepSeek-V4.mdx / deepseek-v4-deployment.jsx):
# TP=4 + DP=4 + DP-attention + DeepEP + MTP 1/2 (cookbook lists cg=128 /
# max-run=256; this recipe caps both at 64).
# Tests TEP vs pure-TP speedup at medium concurrency.
name: "dsv4-pro-gb300-agg-balanced-1k1k-official"

slurm:
partition: gb300
time_limit: "4:00:00"

model:
path: "dsv4-pro"
container: "dsv4-grace-blackwell"
precision: "fp4"

frontend:
type: sglang

resources:
gpu_type: "gb300"
gpus_per_node: 4
agg_nodes: 1
agg_workers: 1

backend:
type: sglang

aggregated_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"

sglang_config:
aggregated:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
model-path: "/model/"
trust-remote-code: true

# TEP: TP=DP=4 + dp-attention + deepep
tensor-parallel-size: 4
data-parallel-size: 4
enable-dp-attention: true
moe-a2a-backend: "deepep"
deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# MTP 1/2 (balanced - gentler than 3/4)
speculative-algo: "EAGLE"
speculative-num-steps: 1
speculative-eagle-topk: 1
speculative-num-draft-tokens: 2

mem-fraction-static: 0.9
context-length: 2200
max-prefill-tokens: 4096
cuda-graph-max-bs: 64
max-running-requests: 64
disable-radix-cache: true
decode-log-interval: 1
stream-interval: 50

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
random_range_ratio: 0.8
concurrencies: "32x64"
req_rate: "inf"
use_chat_template: false