diff --git a/recipes/gb200-fp4/1k1k-dsv4/README.md b/recipes/gb200-fp4/1k1k-dsv4/README.md
new file mode 100644
index 00000000..5208fbca
--- /dev/null
+++ b/recipes/gb200-fp4/1k1k-dsv4/README.md
@@ -0,0 +1,21 @@
+# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k aggregated on GB200
+
+NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro** (MXFP4) on **GB200**
+(ARM64 Grace + Blackwell, 4 GPUs per node), aggregated mode, 1k/1k workload.
+Per-GPU HBM on GB200 is smaller than on GB300, so the 1.6T MXFP4 checkpoint
+only fits across **2 nodes (8 GPUs) at TP=8**.
+
+## Container
+
+Same Grace+Blackwell aarch64 image as GB300 (shared enroot sqsh alias
+`dsv4-grace-blackwell` in `srtslurm.yaml.example`).
+
+## Recipes
+
+| file | parallelism | MTP | notes |
+|---|---|---|---|
+| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2-node |
+| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2-node |
+
+See `recipes/gb300-fp4/1k1k-dsv4/README.md` for the full flag rationale — flags
+match the GB300 2-node recipes apart from the partition, gpu_type, and cache paths.
diff --git a/recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
new file mode 100644
index 00000000..cb2e71a1
--- /dev/null
+++ b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
@@ -0,0 +1,68 @@
+# DeepSeek-V4-Pro aggregated on GB200 2 nodes (TP=8) - MTP enabled
+name: "dsv4-pro-gb200-2n-agg-ll-1k1k-official"
+
+slurm:
+  partition: gb200
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
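+    # Example (paths illustrative; check the srt-slurm docs for the exact
+    # extra_mount syntax on your version):
+    #   extra_mount: ["/shared/cache/dsv4:/configs/dsv4"]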
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb200/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb200/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      speculative-algo: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
new file mode 100644
index 00000000..570e54e9
--- /dev/null
+++ b/recipes/gb200-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
@@ -0,0 +1,61 @@
+# DeepSeek-V4-Pro aggregated on GB200 2 nodes (TP=8) - no MTP
+name: "dsv4-pro-gb200-2n-agg-nomtp-1k1k-official"
+
+slurm:
+  partition: gb200
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb200"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb200/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb200/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/README.md b/recipes/gb300-fp4/1k1k-dsv4/README.md
new file mode 100644
index 00000000..36fb3ebb
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/README.md
@@ -0,0 +1,63 @@
+# DeepSeek-V4-Pro (1.6T MoE, MXFP4) — 1k/1k aggregated on GB300
+
+This directory contains NVIDIA-verified SGLang recipes for **DeepSeek-V4-Pro**
+(1.6T-parameter MoE with MXFP4 MoE weights + FP8 KV, UE8M0 scales) on **GB300**
+(ARM64 Grace + Blackwell, 4 GPUs per node), aggregated serving mode, 1024 input /
+1024 output workload.
+
+## Container
+
+All recipes reference the `dsv4-grace-blackwell` alias defined in
+`srtslurm.yaml.example`. Pull and convert the image:
+
+```bash
+enroot import --output sglang-deepseek-v4-grace-blackwell.sqsh \
+  docker://lmsysorg/sglang:deepseek-v4-grace-blackwell
+```
+
+(Use the `deepseek-v4-blackwell` image for B200 x86_64, or `deepseek-v4-hopper` for H200.)
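+
+Optionally, sanity-check that the pulled image is aarch64 before queueing jobs.
+This is an illustrative check using stock `enroot` commands (the container name
+`dsv4-check` is arbitrary):
+
+```bash
+enroot create --name dsv4-check sglang-deepseek-v4-grace-blackwell.sqsh
+enroot start dsv4-check uname -m   # expect: aarch64
+enroot remove -f dsv4-check
+```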
+
+## Model checkpoint
+
+```bash
+hf download deepseek-ai/DeepSeek-V4-Pro --local-dir /shared/models/deepseek/DeepSeek-V4-Pro
+```
+
+## Recipes
+
+| file | parallelism | MTP | target | notes |
+|---|---|---|---|---|
+| `agg-low-latency.yaml` | TP=4 | EAGLE 3/4 | minimum TPOT / best per-user latency | GB300 1 node |
+| `agg-nomtp.yaml` | TP=4 | — | baseline throughput, no spec decoding | GB300 1 node |
+| `agg-balanced-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | EAGLE 1/2 | Pareto mid-curve | GB300 1 node |
+| `agg-max-tpt-tep.yaml` | TP=4 + DP=4 + DP-attn + DeepEP | — | maximum TPS/GPU | GB300 1 node |
+| `agg-2n-low-latency.yaml` | TP=8 | EAGLE 3/4 | low-latency, 2× memory headroom | GB300 2 nodes |
+| `agg-2n-nomtp.yaml` | TP=8 | — | throughput, 2× memory headroom | GB300 2 nodes |
+
+## Key flags (derived from the SGLang DSv4 cookbook)
+
+- `moe-runner-backend: flashinfer_mxfp4` — MXFP4 MoE kernels (Blackwell only).
+- `chunked-prefill-size: 4096` + `disable-flashinfer-autotune: true` — per the cookbook recipe.
+- `disable-radix-cache: true` — synthetic benchmark best practice; also
+  reduces contiguous-allocator fragmentation at weight-reorder time.
+- `mem-fraction-static: 0.78` — leaves headroom for the MXFP4
+  `reorder_w1w3_to_w3w1` path (0.82 intermittently OOMs on GB300).
+- TEP recipes: `enable-dp-attention` + `moe-a2a-backend: deepep`, plus
+  `deepep-config` with `num_sms=96` (DeepEP `DEEPEP_LARGE_SMS_FLAG` for
+  single-node Blackwell per the cookbook).
+
+## References
+
+- [SGLang cookbook: `docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx`](https://github.com/sgl-project/sglang/blob/main/docs/cookbook/autoregressive/DeepSeek/DeepSeek-V4.mdx)
+- [DeepSeek-V4-Pro model card](https://huggingface.co/deepseek-ai/DeepSeek-V4-Pro)
+- Upstream SGLang PR: sgl-project/sglang#23600
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
new file mode 100644
index 00000000..35a4fc90
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-low-latency.yaml
@@ -0,0 +1,66 @@
+# DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - MTP enabled
+name: "dsv4-pro-gb300-2n-agg-ll-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      speculative-algo: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
new file mode 100644
index 00000000..285ae192
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-2n-nomtp.yaml
@@ -0,0 +1,62 @@
+# DeepSeek-V4-Pro aggregated on GB300 2 nodes (TP=8) - no MTP
+name: "dsv4-pro-gb300-2n-agg-nomtp-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 2
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+      tensor-parallel-size: 8
+
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.82
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml
new file mode 100644
index 00000000..740e48fd
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-balanced-tep.yaml
@@ -0,0 +1,79 @@
+# DeepSeek-V4-Pro GB300 1n TEP 'balanced' recipe
+# From SGLang cookbook (DeepSeek-V4.mdx / deepseek-v4-deployment.jsx):
+# TP=4 + DP=4 + DP-attention + DeepEP + MTP 1/2 + cg=128 max-run=256
+# Tests TEP vs pure-TP speedup at medium concurrency.
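+#
+# Note on GPU count (per SGLang's dp-attention semantics; verify on your build):
+# with enable-dp-attention, data-parallel-size=4 subdivides the TP=4 group rather
+# than multiplying it, so this recipe still runs on 4 GPUs total. Attention runs
+# data-parallel (each rank serves its own slice of the batch) while MoE layers
+# run expert-parallel via DeepEP (TEP).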
+name: "dsv4-pro-gb300-agg-balanced-1k1k-official" + +slurm: + partition: gb300 + time_limit: "4:00:00" + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + +frontend: + type: sglang + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" + # Persistent JIT caches (recommended for repeatable runs). + # Mount your own cache dirs into /configs/** via srt-slurm extra_mount + # or change these to any writable path inside the container. + SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache" + FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache" + + sglang_config: + aggregated: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + + # TEP: TP=DP=4 + dp-attention + deepep + tensor-parallel-size: 4 + data-parallel-size: 4 + enable-dp-attention: true + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + # MTP 1/2 (balanced - gentler than 3/4) + speculative-algo: "EAGLE" + speculative-num-steps: 1 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 2 + + mem-fraction-static: 0.78 + context-length: 2200 + cuda-graph-max-bs: 128 + max-running-requests: 256 + disable-radix-cache: true + decode-log-interval: 1 + stream-interval: 50 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64x128x256" + req_rate: "inf" diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-low-latency.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-low-latency.yaml new file mode 100644 index 00000000..f8bc23df --- /dev/null +++ b/recipes/gb300-fp4/1k1k-dsv4/agg-low-latency.yaml @@ -0,0 +1,71 @@ +# DeepSeek-V4-Pro aggregated mode on GB300 (1 node 4 GPU, TP=4) +# Based on SGLang upstream dsv4-docs cookbook (b200|big|low-latency verified) +# Adapted to GB300 per DeepSeek-V4.mdx: "GB300 4 GPU" is the single-node config +name: "dsv4-pro-gb300-agg-ll-1k1k" + +slurm: + partition: gb300 + time_limit: "4:00:00" + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "fp4" + +frontend: + type: sglang + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + agg_nodes: 1 + agg_workers: 1 + +backend: + type: sglang + + aggregated_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + # Persistent JIT caches (recommended for repeatable runs). + # Mount your own cache dirs into /configs/** via srt-slurm extra_mount + # or change these to any writable path inside the container. 
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+
+      # V4 low-latency recipe: MXFP4 MoE + MTP 3/4 + chunked-prefill 4096
+      moe-runner-backend: "flashinfer_mxfp4"
+      speculative-algo: "EAGLE"
+      speculative-num-steps: 3
+      speculative-eagle-topk: 1
+      speculative-num-draft-tokens: 4
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.78
+
+      # Generic knobs
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-max-tpt-tep.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-max-tpt-tep.yaml
new file mode 100644
index 00000000..66d4e405
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-max-tpt-tep.yaml
@@ -0,0 +1,70 @@
+# DeepSeek-V4-Pro GB300 1n TEP 'max-throughput' recipe
+# From SGLang cookbook (DeepSeek-V4.mdx / deepseek-v4-deployment.jsx):
+# TP=4 + DP=4 + DP-attention + DeepEP, NO MTP + cg=128 max-run=256
+# Tests TEP peak throughput at high concurrency.
+name: "dsv4-pro-gb300-agg-maxtpt-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      # TEP: TP=DP=4 + dp-attention + deepep
+      tensor-parallel-size: 4
+      data-parallel-size: 4
+      enable-dp-attention: true
+      moe-a2a-backend: "deepep"
+      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+      # NO MTP for max-tpt
+
+      mem-fraction-static: 0.78
+      context-length: 2200
+      cuda-graph-max-bs: 128
+      max-running-requests: 256
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256"
+  req_rate: "inf"
diff --git a/recipes/gb300-fp4/1k1k-dsv4/agg-nomtp.yaml b/recipes/gb300-fp4/1k1k-dsv4/agg-nomtp.yaml
new file mode 100644
index 00000000..96b4c690
--- /dev/null
+++ b/recipes/gb300-fp4/1k1k-dsv4/agg-nomtp.yaml
@@ -0,0 +1,65 @@
+# DeepSeek-V4-Pro aggregated mode on GB300 (1 node 4 GPU, TP=4) - NO MTP
+# Baseline for comparison against MTP-enabled run (1567486)
+name: "dsv4-pro-gb300-agg-nomtp-1k1k-official"
+
+slurm:
+  partition: gb300
+  time_limit: "4:00:00"
+
+model:
+  path: "dsv4-pro"
+  container: "dsv4-grace-blackwell"
+  precision: "fp4"
+
+frontend:
+  type: sglang
+
+resources:
+  gpu_type: "gb300"
+  gpus_per_node: 4
+  agg_nodes: 1
+  agg_workers: 1
+
+backend:
+  type: sglang
+
+  aggregated_environment:
+    PYTHONUNBUFFERED: "1"
+    SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    # Persistent JIT caches (recommended for repeatable runs).
+    # Mount your own cache dirs into /configs/** via srt-slurm extra_mount
+    # or change these to any writable path inside the container.
+    SGLANG_DG_CACHE_DIR: "/configs/dsv4/gb300/deepgemm-cache"
+    FLASHINFER_WORKSPACE_BASE: "/configs/dsv4/gb300/flashinfer-cache"
+
+  sglang_config:
+    aggregated:
+      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+      model-path: "/model/"
+      trust-remote-code: true
+
+      tensor-parallel-size: 4
+
+      # No MTP - pure MXFP4 MoE baseline
+      moe-runner-backend: "flashinfer_mxfp4"
+      chunked-prefill-size: 4096
+      disable-flashinfer-autotune: true
+      mem-fraction-static: 0.78
+
+      context-length: 2200
+      cuda-graph-max-bs: 1024
+      max-running-requests: 1024
+      disable-radix-cache: true
+      decode-log-interval: 1
+      stream-interval: 50
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  random_range_ratio: 0.8
+  concurrencies: "1x2x4x8x16x32x64x128x256x512x1024"
+  req_rate: "inf"
diff --git a/srtslurm.yaml.example b/srtslurm.yaml.example
index 3a1462f0..12e34c1b 100644
--- a/srtslurm.yaml.example
+++ b/srtslurm.yaml.example
@@ -24,11 +24,21 @@ containers:
   sglang-latest: "/shared/containers/sglang-v0.4.sqsh"
   sglang-dev: "/shared/containers/sglang-dev.sqsh"
   sglang-fp4: "/shared/containers/sglang-fp4.sqsh"
+  # DeepSeek-V4 cookbook images (pull via enroot from Docker Hub lmsysorg/sglang):
+  #   lmsysorg/sglang:deepseek-v4-blackwell        (B200 / x86_64)
+  #   lmsysorg/sglang:deepseek-v4-grace-blackwell  (GB200 / GB300 / aarch64)
+  #   lmsysorg/sglang:deepseek-v4-hopper           (H200)
+  dsv4-blackwell: "/shared/containers/sglang-deepseek-v4-blackwell.sqsh"
+  dsv4-grace-blackwell: "/shared/containers/sglang-deepseek-v4-grace-blackwell.sqsh"
+  dsv4-hopper: "/shared/containers/sglang-deepseek-v4-hopper.sqsh"

 # Model path aliases
 model_paths:
   deepseek-r1: "/shared/models/deepseek/DeepSeek-R1"
   deepseek-r1-distill: "/shared/models/deepseek/DeepSeek-R1-Distill-Qwen-32B"
+  # DeepSeek-V4 checkpoints (huggingface.co/deepseek-ai/DeepSeek-V4-Pro, -V4-Flash):
+  dsv4-pro: "/shared/models/deepseek/DeepSeek-V4-Pro"
+  dsv4-flash: "/shared/models/deepseek/DeepSeek-V4-Flash"
   llama-3-70b: "/shared/models/meta/llama-3-70b"
   llama-3-405b: "/shared/models/meta/llama-3-405b"