From 67445264f5f28452dbd692e669e5f07fb12c1f36 Mon Sep 17 00:00:00 2001
From: Nicolas Castet <ncastet@nvidia.com>
Date: Fri, 23 Jan 2026 15:47:01 -0600
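Subject: [PATCH] Fix recipes for sglang v0.5.5

Set SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH="1" next to
SGLANG_MOE_NVFP4_DISPATCH in the gb200-fp4 recipes, because older
sglang versions use the former variable. Also strip trailing
whitespace from several of the touched recipe files.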
---
 recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml |  2 ++
 recipies/gb200-fp4/1k1k/max-tpt.yaml     |  1 +
 recipies/gb200-fp4/1k1k/mid-curve.yaml   |  3 +-
 recipies/gb200-fp4/1k8k/max-tpt.yaml     |  1 +
 recipies/gb200-fp4/1k8k/mid-curve.yaml   |  1 +
 recipies/gb200-fp4/8k1k/max-tpt.yaml     | 35 ++++++++++++------------
 recipies/gb200-fp4/8k1k/mid-curve.yaml   | 31 +++++++++++----------
 7 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml b/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml
index 39fccc69..d32da609 100644
--- a/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml
+++ b/recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml
@@ -38,6 +38,7 @@ backend:
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
     SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1"
     SGLANG_ENABLE_SPEC_V2: "1"
     SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
@@ -65,6 +66,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
     SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
     SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1"
     SGLANG_ENABLE_SPEC_V2: "1"
diff --git a/recipies/gb200-fp4/1k1k/max-tpt.yaml b/recipies/gb200-fp4/1k1k/max-tpt.yaml
index 80cb66d9..0cabd2cb 100644
--- a/recipies/gb200-fp4/1k1k/max-tpt.yaml
+++ b/recipies/gb200-fp4/1k1k/max-tpt.yaml
@@ -56,6 +56,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
     SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
diff --git a/recipies/gb200-fp4/1k1k/mid-curve.yaml b/recipies/gb200-fp4/1k1k/mid-curve.yaml
index 6405b5e0..36eef7ab 100644
--- a/recipies/gb200-fp4/1k1k/mid-curve.yaml
+++ b/recipies/gb200-fp4/1k1k/mid-curve.yaml
@@ -1,4 +1,4 @@
-# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher 
+# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
 # per gpu throughput
 
 name: "gb200-fp4-max-tpt-2"
@@ -57,6 +57,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
     SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml
index 97f0f3f1..fe716c81 100644
--- a/recipies/gb200-fp4/1k8k/max-tpt.yaml
+++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml
@@ -55,6 +55,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
 
   sglang_config:
     prefill:
diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml
index 1e5771f2..78296911 100644
--- a/recipies/gb200-fp4/1k8k/mid-curve.yaml
+++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml
@@ -56,6 +56,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
 
   sglang_config:
     prefill:
diff --git a/recipies/gb200-fp4/8k1k/max-tpt.yaml b/recipies/gb200-fp4/8k1k/max-tpt.yaml
index d4f0fb00..e9bda3a4 100644
--- a/recipies/gb200-fp4/8k1k/max-tpt.yaml
+++ b/recipies/gb200-fp4/8k1k/max-tpt.yaml
@@ -7,9 +7,9 @@ model:
 
 resources:
   gpu_type: "gb200"
-  prefill_nodes: 10 
-  decode_nodes: 8 
-  prefill_workers: 10 
+  prefill_nodes: 10
+  decode_nodes: 8
+  prefill_workers: 10
   decode_workers: 1
   gpus_per_node: 4
 
@@ -54,6 +54,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
     SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
@@ -79,7 +80,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9600 
+      context-length: 9600
       disable-shared-experts-fusion: true
       disaggregation-bootstrap-port: 30001
 
@@ -89,8 +90,8 @@ backend:
       # Memory and token limits
       mem-fraction-static: 0.95
       max-total-tokens: 131072
-      max-prefill-tokens: 524288 
-      chunked-prefill-size: 131072 
+      max-prefill-tokens: 524288
+      chunked-prefill-size: 131072
 
       # Request handling
       max-running-requests: 30000
@@ -98,13 +99,13 @@ backend:
 
       # Performance optimizations
       disable-cuda-graph: true
-      enable-dp-attention: false 
+      enable-dp-attention: false
 
       # Parallelism
       tp-size: 4
       dp-size: 1
       ep-size: 1
- 
+
     decode:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
@@ -127,7 +128,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9600 
+      context-length: 9600
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -137,11 +138,11 @@ backend:
 
       # Memory and token limits
       mem-fraction-static: 0.83
-      max-total-tokens: 524288 
-      chunked-prefill-size: 24576 
+      max-total-tokens: 524288
+      chunked-prefill-size: 24576
 
       # Request handling
-      max-running-requests: 16384 
+      max-running-requests: 16384
 
       # DeepEP configuration
       moe-a2a-backend: "deepep"
@@ -159,13 +160,13 @@ backend:
       enable-dp-attention: true
 
       # Parallelism
-      tp-size: 32 
-      dp-size: 32 
-      ep-size: 32 
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
 
 benchmark:
   type: "sa-bench"
-  isl: 8192 
+  isl: 8192
   osl: 1024
   concurrencies: "1024x2048x8192"
-  req_rate: 700 
+  req_rate: 700
diff --git a/recipies/gb200-fp4/8k1k/mid-curve.yaml b/recipies/gb200-fp4/8k1k/mid-curve.yaml
index 58446851..1b0b3246 100644
--- a/recipies/gb200-fp4/8k1k/mid-curve.yaml
+++ b/recipies/gb200-fp4/8k1k/mid-curve.yaml
@@ -8,7 +8,7 @@ model:
 resources:
   gpu_type: "gb200"
   prefill_nodes: 6
-  decode_nodes: 12 
+  decode_nodes: 12
   prefill_workers: 6
   decode_workers: 1
   gpus_per_node: 4
@@ -54,6 +54,7 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
+    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"  # Read by older sglang versions (e.g. v0.5.5)
     SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
@@ -79,7 +80,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9600 
+      context-length: 9600
       disable-shared-experts-fusion: true
       disaggregation-bootstrap-port: 30001
 
@@ -89,8 +90,8 @@ backend:
       # Memory and token limits
       mem-fraction-static: 0.95
       max-total-tokens: 131072
-      max-prefill-tokens: 524288 
-      chunked-prefill-size: 131072 
+      max-prefill-tokens: 524288
+      chunked-prefill-size: 131072
 
       # Request handling
       max-running-requests: 30000
@@ -98,13 +99,13 @@ backend:
 
       # Performance optimizations
       disable-cuda-graph: true
-      enable-dp-attention: false 
+      enable-dp-attention: false
 
       # Parallelism
       tp-size: 4
       dp-size: 1
       ep-size: 1
- 
+
     decode:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
@@ -127,7 +128,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9600 
+      context-length: 9600
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -137,11 +138,11 @@ backend:
 
       # Memory and token limits
       mem-fraction-static: 0.83
-      max-total-tokens: 524288 
-      chunked-prefill-size: 24576 
+      max-total-tokens: 524288
+      chunked-prefill-size: 24576
 
       # Request handling
-      max-running-requests: 16384 
+      max-running-requests: 16384
 
       # DeepEP configuration
       moe-a2a-backend: "deepep"
@@ -159,13 +160,13 @@ backend:
       enable-dp-attention: true
 
       # Parallelism
-      tp-size: 48 
-      dp-size: 48 
-      ep-size: 48 
+      tp-size: 48
+      dp-size: 48
+      ep-size: 48
 
 benchmark:
   type: "sa-bench"
-  isl: 8192 
+  isl: 8192
   osl: 1024
   concurrencies: "512x1024x2048x4096"
-  req_rate: 700 
\ No newline at end of file
+  req_rate: 700