This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
20 changes: 11 additions & 9 deletions recipies/gb200-fp4/1k8k/low-latency.yaml
@@ -1,8 +1,16 @@
 name: "gb200-fp4-1p2d"
 
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -24,8 +32,6 @@ backend:
       SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
       SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
       SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-      #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-      #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
       MC_FORCE_MNNVL: "1"
       NCCL_MNNVL_ENABLE: "1"
       NCCL_CUMEM_ENABLE: "1"
@@ -43,8 +49,6 @@ backend:
       SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
       SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
       SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-      # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-      # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
       MC_FORCE_MNNVL: "1"
       NCCL_MNNVL_ENABLE: "1"
       NCCL_CUMEM_ENABLE: "1"
@@ -64,7 +68,7 @@ backend:
       moe-runner-backend: "flashinfer_trtllm"
       stream-interval: 10
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       mem-fraction-static: 0.95
       max-total-tokens: 8192
       chunked-prefill-size: 8192
Comment on lines +71 to 74
⚠️ Potential issue | 🟠 Major

Align max-total-tokens with the new 10k context length.

context-length: 10000 exceeds max-total-tokens: 8192, which can truncate or reject longer contexts in prefill. Please set max-total-tokens ≥ 10000 (or reduce context-length).

🛠️ Proposed fix
-      max-total-tokens: 8192
+      max-total-tokens: 10000
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
       context-length: 10000
       mem-fraction-static: 0.95
-      max-total-tokens: 8192
+      max-total-tokens: 10000
       chunked-prefill-size: 8192
🤖 Prompt for AI Agents
In `@recipies/gb200-fp4/1k8k/low-latency.yaml` around lines 71-74: the config sets context-length: 10000 but max-total-tokens: 8192, which can truncate or reject prefill. Raise max-total-tokens to at least 10000 (or lower context-length) so the two limits align.
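A mismatch like this is easy to catch mechanically before deploying a recipe. As a sketch (the helper name and dict shape are assumptions, not part of any existing recipe tooling), a pre-deploy check over one already-parsed sglang_config section could look like:

```python
def check_token_limits(server_args: dict) -> list[str]:
    """Flag configs where context-length exceeds max-total-tokens.

    `server_args` is assumed to be a single parsed sglang_config
    section (e.g. the `prefill` or `decode` mapping) from a recipe YAML.
    """
    problems = []
    ctx = server_args.get("context-length")
    max_tot = server_args.get("max-total-tokens")
    if ctx is not None and max_tot is not None and ctx > max_tot:
        problems.append(
            f"context-length {ctx} exceeds max-total-tokens {max_tot}; "
            f"requests longer than {max_tot} tokens may be truncated or rejected"
        )
    return problems

# The low-latency prefill section from this diff trips the check:
prefill = {
    "context-length": 10000,
    "mem-fraction-static": 0.95,
    "max-total-tokens": 8192,
    "chunked-prefill-size": 8192,
}
print(check_token_limits(prefill))
```

Sections that omit max-total-tokens (like the decode block in this file) pass through unflagged, since SGLang derives the limit itself in that case.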

@@ -77,7 +81,6 @@ backend:
       data-parallel-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
-      disaggregation-transfer-backend: nixl
 
     decode:
       disaggregation-mode: "decode"
@@ -92,15 +95,14 @@ backend:
       disaggregation-bootstrap-port: 30001
       stream-interval: 10
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       mem-fraction-static: 0.95
       chunked-prefill-size: 8192
       cuda-graph-max-bs: 256
       scheduler-recv-interval: 10
       moe-dense-tp-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
-      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"
20 changes: 12 additions & 8 deletions recipies/gb200-fp4/1k8k/max-tpt.yaml
@@ -1,10 +1,16 @@
 # 4P1D, with 12 Decode Nodes. Uses single batch overlap
 
 name: "gb200-fp4-max-tpt"
 
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
Comment on lines +3 to +13
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

sglang v0.5.5 release notes features documentation

💡 Result:

SGLang v0.5.5 — release notes (highlights / features)

From the official GitHub release “Release v0.5.5” (published Nov 6, 2025), the listed highlights are:

  • Day-0 support for Kimi-K2-Thinking and MiniMax-M2 models. [1]
  • Video + image generation (diffusion) support, linked to the LMSYS post announcing SGLang Diffusion. [1][2]
  • Blackwell kernel optimizations and MoE runner backend refactor. [1]
  • Overlap spec and prefill CUDA graph support expanded to more models. [1]
  • Gateway section: “Gateway Changes (48 commits)” and changelog range gateway-v0.2.1...gateway-v0.2.2. [1]

Documentation (official)

  • Main docs home: https://docs.sglang.io/ (also mirrored at https://docs.sglang.ai/). [3]
  • Install guide: https://docs.sglang.io/get_started/install.html. [4]
  • Diffusion models (SGLang Diffusion) docs: https://docs.sglang.io/supported_models/diffusion_models.html. [5]
  • Model Gateway / Router docs: https://docs.sglang.io/advanced_features/router.html and https://docs.sglang.io/advanced_features/sgl_model_gateway.html. [6][7]

Primary release-notes link

  • GitHub releases page (contains Release v0.5.5): https://github.com/sgl-project/sglang/releases. [1]

Sources: [1] GitHub Releases (v0.5.5) (github.com), [2] LMSYS diffusion announcement (lmsys.org), [3] Docs home (docs.sglang.io), [4] Install guide (docs.sglang.io), [5] Diffusion models docs (docs.sglang.io), [6] Model Gateway/Router docs (docs.sglang.io), [7] Model Gateway page (docs.sglang.io).


🌐 Web query:

Dynamo 0.7.0 sglang compatibility enable_multiple_frontends

💡 Result:

For NVIDIA Dynamo v0.7.0, the supported/pinned SGLang version is 0.5.3.post4 (per the Dynamo 0.7.0 support matrix). If you’re hitting an error around enable_multiple_frontends, it’s almost always a version/flag mismatch (i.e., a flag expected by one version isn’t recognized by the other). [1]

Key points:

  • Dynamo’s SGLang worker (python -m dynamo.sglang) uses SGLang’s native argument parser, so you should only pass valid SGLang server args for the SGLang version Dynamo is using. [2]
  • In the published SGLang server arguments documentation, there is no documented enable_multiple_frontends / --enable-multiple-frontends server flag, so if something is trying to pass it, it’s likely coming from a mismatched wrapper/script or a different SGLang fork/version. [3]
  • Fix: ensure you install and run matching Dynamo/SGLang versions (e.g., install ai-dynamo==0.7.0 with the sglang extra, which is intended to align dependencies), and remove/stop passing enable_multiple_frontends. [1][2]

Sources
[1] Dynamo 0.7.0 Support Matrix (SGLang build dependency version) https://docs.nvidia.com/dynamo/archive/0.7.0/reference/support-matrix.html
[2] “Running SGLang with Dynamo” (native SGLang args passthrough) https://docs.nvidia.com/dynamo/dev/backends/sglang/README.html
[3] SGLang “Server Arguments” reference https://docs.sglang.io/advanced_features/server_arguments.html


🌐 Web query:

sglang lmsysorg container v0.5.5 supported features

💡 Result:

lmsysorg/sglang:v0.5.5 (SGLang v0.5.5) supports, per the v0.5.5 release highlights/changes:

  • Day-0 model support: Kimi-K2-Thinking and MiniMax-M2. [1]
  • Diffusion (image + video generation): “SGLang Diffusion” acceleration, with OpenAI-compatible API/CLI/Python entry points and support for major open-source video/image diffusion models (e.g., Wan, Hunyuan, Qwen-Image/Qwen-Image-Edit, Flux). [1][2]
  • NVIDIA Blackwell-focused improvements: kernel optimizations plus MoE runner backend refactor. [1]
  • Broader overlap/prefill improvements: “overlap spec” and prefill CUDA graph support for more models. [1]
  • Notable items in “What’s Changed”: decoupled quantization from vLLM dependency (GGUF SRT), direct video inference, Llama 4 + TensorRT-LLM MHA enablement, and multiple MoE runner/perf updates (incl. flashinfer_trtllm backend). [1]

Sources: [1] SGLang GitHub release v0.5.5, [2] LMSYS blog “SGLang Diffusion: Accelerating Video and Image Generation”.


Remove enable_multiple_frontends flag and pin sglang container to compatible version.

The configuration has two critical issues:

  1. enable_multiple_frontends: true is not a valid SGLang server argument—it will cause the server to fail with an unrecognized flag error. This flag does not exist in SGLang's documented server arguments.

  2. Container version mismatch: Dynamo 0.7.0 officially pins SGLang to v0.5.3.post4, but the configuration specifies lmsysorg/sglang:v0.5.5.post2. This version mismatch can cause compatibility issues. Use the pinned version or verify that v0.5.5 is compatible with your Dynamo 0.7.0 deployment.

🤖 Prompt for AI Agents
In `@recipies/gb200-fp4/1k8k/max-tpt.yaml` around lines 3-13: remove the invalid SGLang server flag enable_multiple_frontends (and the related num_additional_frontends) from the frontend block so the dynamo frontend invocation does not pass unrecognized arguments, and change the model.container value from "lmsysorg/sglang:v0.5.5.post2" to the Dynamo 0.7.0-pinned version "lmsysorg/sglang:v0.5.3.post4" (or another version you have verified compatible) so the container matches dynamo.version 0.7.0.
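Taken together, the review's suggested shape for this block would be roughly as follows (a sketch only; the pinned container tag comes from the Dynamo 0.7.0 support matrix cited above and should be verified against your deployment):

```yaml
name: "gb200-fp4-max-tpt"

dynamo:
  version: 0.7.0

# No enable_multiple_frontends / num_additional_frontends here:
# they are not recognized SGLang server arguments. Multi-frontend
# traffic is routed via the SGLang Model Gateway (Router) instead.
frontend:
  type: dynamo

model:
  path: "dsr1"
  # Pinned to the SGLang build Dynamo 0.7.0 is tested against.
  container: "lmsysorg/sglang:v0.5.3.post4"
  precision: "fp4"
```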

   precision: "fp4"
 
 resources:
@@ -56,13 +62,13 @@ backend:
       SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
       SGLANG_MOE_NVFP4_DISPATCH: "1"
       SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
+      SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
-      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -80,7 +86,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -112,7 +118,6 @@ backend:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
-      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -130,7 +135,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -228,7 +233,6 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
-      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 48
21 changes: 12 additions & 9 deletions recipies/gb200-fp4/1k8k/mid-curve.yaml
@@ -1,11 +1,16 @@
 # 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
 # per gpu throughput
-name: "gb200-fp4-mid-curve"
 
+name: "gb200-fp4-max-tpt-2"
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
Comment on lines +3 to +13
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

sglang v0.5.5.post2 release notes

💡 Result:

SGLang v0.5.5.post2 was tagged/released on Nov 12, 2025 and is primarily a patch release to “include the critical fix” from PR #12915. [1]

What changed (the “critical fix”)

PR #12915 modifies FlashInfer communication fusion defaults in python/sglang/srt/layers/flashinfer_comm_fusion.py: [2]

  • Lowered max_token_num default from 16384 → 2048 in:
    • ensure_workspace_initialized(...)
    • flashinfer_allreduce_residual_rmsnorm(...) [2]
  • Removed the fallback that disabled the fused path when input_tensor.shape[0] > max_token_num (it no longer returns None, None in that case). [2]

This PR is titled “Fix spec decoding acc length for dpsk-r1-fp4 tp8”, indicating the patch targets speculative decoding accuracy/length behavior for that configuration. [2]

Sources: GitHub release tag page for v0.5.5.post2 [1] and the referenced fix PR diff [2].


🌐 Web query:

Dynamo 0.7.0 sglang compatibility

💡 Result:

For Dynamo v0.7.0, the officially tested / build dependency SGLang version is 0.5.3.post4. [1]

Practical guidance from Dynamo’s SGLang backend docs:

  • Install via ai-dynamo[sglang] (recommended; Dynamo notes it “comes bundled with a stable sglang version”). [2]
  • If you install SGLang separately for development, Dynamo’s SGLang backend docs indicate you should use SGLang >= 0.5.3.post2 (but Dynamo’s 0.7.0 support matrix specifies 0.5.3.post4 as the known-compatible build version). [2] [1]

Feature notes for the Dynamo↔SGLang integration:

  • Core features like disaggregated serving and KV-aware routing are supported. [2]
  • KV Block Manager (KVBM) is not supported for SGLang in this release (planned). [3]

Sources: Dynamo v0.7.0 support matrix; Dynamo SGLang backend docs; Dynamo v0.7.0 release notes. [1] [2] [3]


🌐 Web query:

sglang enable_multiple_frontends support version

💡 Result:

enable_multiple_frontends is not a supported SGLang server argument in any currently documented SGLang release (it does not appear in the official “Server Arguments” reference / sglang.launch_server --help surface). [1]

If what you want is “multiple frontends / multiple entrypoints / multi-tenant traffic”, SGLang’s documented solution is to run workers behind the SGLang Model Gateway (Router) rather than a single launch_server flag. [2]

Sources: [1] [2]


Update container to use sglang 0.5.3.post4 and configure multi-frontend serving correctly.

Dynamo 0.7.0 officially supports sglang 0.5.3.post4, not v0.5.5.post2. Additionally, enable_multiple_frontends and num_additional_frontends are not supported SGLang server arguments. For multi-tenant serving, use the SGLang Model Gateway (Router) instead of these flags.

🤖 Prompt for AI Agents
In `@recipies/gb200-fp4/1k8k/mid-curve.yaml` around lines 3-13: change the model.container value from "lmsysorg/sglang:v0.5.5.post2" to "lmsysorg/sglang:0.5.3.post4" and remove the unsupported frontend keys enable_multiple_frontends and num_additional_frontends; for multi-tenant or multi-frontend serving, route traffic through the SGLang Model Gateway (Router) instead of using those flags.
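The same class of mistake can be screened for before launch. As an illustrative sketch (the flagged key names are taken from this review, not from any official SGLang schema), a recipe linter could reject frontend keys that the server is not known to accept:

```python
# Keys this review identifies as not being valid SGLang server arguments.
# The set is illustrative, not an official schema.
UNSUPPORTED_FRONTEND_KEYS = {"enable_multiple_frontends", "num_additional_frontends"}

def find_unsupported_frontend_keys(recipe: dict) -> list[str]:
    """Return frontend keys in a parsed recipe that this linter flags."""
    frontend = recipe.get("frontend", {})
    return sorted(k for k in frontend if k in UNSUPPORTED_FRONTEND_KEYS)

recipe = {
    "frontend": {
        "type": "dynamo",
        "enable_multiple_frontends": True,
        "num_additional_frontends": 9,
    }
}
print(find_unsupported_frontend_keys(recipe))
# → ['enable_multiple_frontends', 'num_additional_frontends']
```

Running such a check in CI against each recipe YAML would have caught the flags in all three files before they reached a cluster.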

   precision: "fp4"
 
 resources:
@@ -57,6 +62,7 @@ backend:
       SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
       SGLANG_MOE_NVFP4_DISPATCH: "1"
       SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
+      SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -67,7 +73,6 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
-      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -81,7 +86,7 @@ backend:
      stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -117,7 +122,6 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
-      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -131,7 +135,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -228,7 +232,6 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
-      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 32