From 1153beaebb671d3174cef398e9b4d14e582e9719 Mon Sep 17 00:00:00 2001
From: Weiliang Liu <weiliangl@nvidia.com>
Date: Mon, 27 Apr 2026 04:03:34 -0700
Subject: [PATCH 1/3] Add DSV4 Pro GB300 high-throughput recipe

---
 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml | 157 +++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml

diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
new file mode 100644
index 00000000..1ce2fdf8
--- /dev/null
+++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
@@ -0,0 +1,157 @@
+base:
+  name: "dsv4-pro-gb300-fp4"
+
+  slurm:
+    partition: gb300
+    time_limit: "03:00:00"
+
+  frontend:
+    type: sglang
+    enable_multiple_frontends: false
+    args:
+      policy: "cache_aware"
+
+  model:
+    path: "dsv4-pro"
+    container: "dsv4-grace-blackwell"
+    precision: "fp4"
+
+  resources:
+    gpu_type: "gb300"
+    gpus_per_node: 4
+    prefill_nodes: 1
+    prefill_workers: 1
+    decode_nodes: 2
+    decode_workers: 1
+
+  backend:
+    type: sglang
+
+    prefill_environment:
+      PYTHONUNBUFFERED: "1"
+      SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+      SGLANG_ENABLE_THINKING: "1"
+      SGLANG_REASONING_EFFORT: "max"
+      SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_USE_JIT_NORM: "1"
+      SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+      SGLANG_OPT_USE_TOPK_V2: "1"
+      SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
+      SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+      SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+      SGLANG_OPT_USE_FAST_MASK_EP: "1"
+      SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+      SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+      NCCL_MNNVL_ENABLE: "1"
+      NCCL_CUMEM_ENABLE: "1"
+      SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+      MC_FORCE_MNNVL: "1"
+
+
+    decode_environment:
+      PYTHONUNBUFFERED: "1"
+      SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
+      SGLANG_ENABLE_THINKING: "1"
+      SGLANG_REASONING_EFFORT: "max"
+      SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_USE_JIT_NORM: "1"
+      SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
+      SGLANG_OPT_USE_TOPK_V2: "1"
+      SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
+      SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
+      SGLANG_OPT_USE_FAST_MASK_EP: "1"
+      SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
+      SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
+      NCCL_MNNVL_ENABLE: "1"
+      NCCL_CUMEM_ENABLE: "1"
+      SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+      MC_FORCE_MNNVL: "1"
+      # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
+      # is single-node only and corrupts results in 2-node decode setups.
+
+    sglang_config:
+      prefill:
+        served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+        model-path: "/model/"
+        trust-remote-code: true
+
+        # Parallel
+        tensor-parallel-size: 4
+        data-parallel-size: 4
+        expert-parallel-size: 4
+
+        enable-dp-attention: true
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        disaggregation-mode: "prefill"
+        disaggregation-transfer-backend: mooncake
+
+        mem-fraction-static: 0.90
+        max-running-requests: 512
+        cuda-graph-max-bs: 512
+        chunked-prefill-size: 65536
+        disable-radix-cache: true
+
+      decode:
+        served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
+        model-path: "/model/"
+        trust-remote-code: true
+        disable-radix-cache: true
+
+        disaggregation-mode: "decode"
+        disaggregation-transfer-backend: mooncake
+
+        # Decode: DEP8 (2 nodes)
+        tensor-parallel-size: 8
+        data-parallel-size:   8
+        expert-parallel-size: 8
+
+        # Lower mfs on decode: DEP8 weight memory + KV pool both grow ~2x
+        # vs DEP4, so 0.83 leaves enough headroom for cuda-graph capture
+        # at cgmb=2048.
+        mem-fraction-static: 0.83
+        max-running-requests: 4096
+        cuda-graph-max-bs:    4096
+        swa-full-tokens-ratio: 0.1
+        context-length: 16384
+
+  benchmark:
+    type: "sa-bench"
+    concurrencies: "8192"
+    use_chat_template: false
+
+############ 8k1k ##############
+# [0]is wideep, [1] is narrow ep
+zip_override_8k1k_hightpt:
+  resources:
+    prefill_nodes:   [5, 1]
+    prefill_workers: [5, 1]
+    decode_nodes:    [8, 2]
+    decode_workers:  [1, 1]
+  backend:
+    sglang_config:
+      decode:
+        tensor-parallel-size:     [32, 8]
+        data-parallel-size:       [32, 8]
+        expert-parallel-size:     [32, 8]
+
+        enable-dp-attention: true
+        ep-num-redundant-experts: [32, null]
+        ep-dispatch-algorithm: ["static", null]
+        moe-a2a-backend: "deepep"
+        deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+        max-running-requests: [2048, 256]
+        cuda-graph-max-bs:    [2048, 256]
+
+  benchmark:
+    isl: 8192
+    osl: 1024
+    concurrencies: "4096"
+
+############ 1k1k ###############

From 0e1608accd5394e9f877615df13e685100a6b0ab Mon Sep 17 00:00:00 2001
From: Weiliang Liu <weiliangl@nvidia.com>
Date: Mon, 27 Apr 2026 07:21:22 -0700
Subject: [PATCH 2/3] fix wideep oom

---
 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
index 1ce2fdf8..a6326fff 100644
--- a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
+++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
@@ -114,7 +114,7 @@ base:
         # Lower mfs on decode: DEP8 weight memory + KV pool both grow ~2x
         # vs DEP4, so 0.83 leaves enough headroom for cuda-graph capture
         # at cgmb=2048.
-        mem-fraction-static: 0.83
+        mem-fraction-static: 0.75
         max-running-requests: 4096
         cuda-graph-max-bs:    4096
         swa-full-tokens-ratio: 0.1
@@ -141,6 +141,7 @@ zip_override_8k1k_hightpt:
         expert-parallel-size:     [32, 8]
 
         enable-dp-attention: true
+        enable-dp-lm-head: true
         ep-num-redundant-experts: [32, null]
         ep-dispatch-algorithm: ["static", null]
         moe-a2a-backend: "deepep"

From 051024e73f258bf9b8019c17c6ca79b51a976e2b Mon Sep 17 00:00:00 2001
From: Weiliang Liu <weiliangl@nvidia.com>
Date: Mon, 27 Apr 2026 08:25:54 -0700
Subject: [PATCH 3/3] optimize perf

---
 recipes/dsv4-pro/sglang/gb300-fp4/all.yaml | 47 ++++++++++++++--------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
index a6326fff..a948593c 100644
--- a/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
+++ b/recipes/dsv4-pro/sglang/gb300-fp4/all.yaml
@@ -24,6 +24,11 @@ base:
     decode_nodes: 2
     decode_workers: 1
 
+  # extra_mount:
+  # - /mnt/home/weiliang/project/sglang:/sgl-workspace/sglang
+
+  # setup_script: "install_sglang.sh"
+
   backend:
     type: sglang
 
@@ -33,6 +38,7 @@ base:
       SGLANG_ENABLE_THINKING: "1"
       SGLANG_REASONING_EFFORT: "max"
       SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
       SGLANG_OPT_USE_JIT_NORM: "1"
       SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
       SGLANG_OPT_USE_TOPK_V2: "1"
@@ -41,14 +47,16 @@ base:
       SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
       SGLANG_OPT_USE_FAST_MASK_EP: "1"
       SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
       SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
       SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
       NCCL_MNNVL_ENABLE: "1"
       NCCL_CUMEM_ENABLE: "1"
       SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
       MC_FORCE_MNNVL: "1"
-
+      SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+      SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+      SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
 
     decode_environment:
       PYTHONUNBUFFERED: "1"
@@ -56,6 +64,7 @@ base:
       SGLANG_ENABLE_THINKING: "1"
       SGLANG_REASONING_EFFORT: "max"
       SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
+      SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
       SGLANG_OPT_USE_JIT_NORM: "1"
       SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
       SGLANG_OPT_USE_TOPK_V2: "1"
@@ -63,13 +72,16 @@ base:
       SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
       SGLANG_OPT_USE_FAST_MASK_EP: "1"
       SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
-      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096"
+      SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "576"
       SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
       SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"
       NCCL_MNNVL_ENABLE: "1"
       NCCL_CUMEM_ENABLE: "1"
       SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
       MC_FORCE_MNNVL: "1"
+      SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+      SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+      SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
       # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
       # is single-node only and corrupts results in 2-node decode setups.
 
@@ -94,7 +106,7 @@ base:
         mem-fraction-static: 0.90
         max-running-requests: 512
         cuda-graph-max-bs: 512
-        chunked-prefill-size: 65536
+        chunked-prefill-size: 32768
         disable-radix-cache: true
 
       decode:
@@ -114,9 +126,9 @@ base:
-        # Lower mfs on decode: DEP8 weight memory + KV pool both grow ~2x
-        # vs DEP4, so 0.83 leaves enough headroom for cuda-graph capture
-        # at cgmb=2048.
-        mem-fraction-static: 0.75
-        max-running-requests: 4096
-        cuda-graph-max-bs:    4096
+        # Decode memory budget for the base (DEP8, 2-node) layout.
+        # NOTE(review): 0.9 replaces the earlier 0.75 OOM workaround; confirm it
+        # still leaves headroom for cuda-graph capture at cuda-graph-max-bs=8192.
+        mem-fraction-static: 0.9
+        max-running-requests: 8192
+        cuda-graph-max-bs:    8192
         swa-full-tokens-ratio: 0.1
         context-length: 16384
 
@@ -129,30 +141,31 @@ base:
 # [0]is wideep, [1] is narrow ep
 zip_override_8k1k_hightpt:
   resources:
-    prefill_nodes:   [5, 1]
-    prefill_workers: [5, 1]
-    decode_nodes:    [8, 2]
+    prefill_nodes:   [14, 1]
+    prefill_workers: [14, 1]
+    decode_nodes:    [4, 2]
     decode_workers:  [1, 1]
   backend:
     sglang_config:
       decode:
-        tensor-parallel-size:     [32, 8]
-        data-parallel-size:       [32, 8]
-        expert-parallel-size:     [32, 8]
+        tensor-parallel-size:     [16, 8]
+        data-parallel-size:       [16, 8]
+        expert-parallel-size:     [16, 8]
 
         enable-dp-attention: true
         enable-dp-lm-head: true
-        ep-num-redundant-experts: [32, null]
+
+        ep-num-redundant-experts: [16, null]
         ep-dispatch-algorithm: ["static", null]
         moe-a2a-backend: "deepep"
         deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
 
-        max-running-requests: [2048, 256]
-        cuda-graph-max-bs:    [2048, 256]
+        max-running-requests: [9216, 256]
+        cuda-graph-max-bs:    [576, 32]
 
   benchmark:
     isl: 8192
     osl: 1024
-    concurrencies: "4096"
+    concurrencies: "8192"
 
 ############ 1k1k ###############