Update gb200 recipes #130
Changes from all commits: 0c90137, acb398b, 6fa4eb6, 55e80e9, 6ff475f, 41385b5
```diff
@@ -1,16 +1,17 @@
-name: "gb200-fp4-1p2d"
+name: "gb200-fp4-1k8k-low-latency"
 
 dynamo:
-  version: 0.7.0
+  version: 0.8.1
```
**Contributor comment on lines +1 to +4**

🧩 Analysis chain

🏁 Script executed:

```bash
#!/bin/bash
# Check if dynamo 0.8.1 exists on PyPI
curl -s "https://pypi.org/pypi/dynamo/0.8.1/json" | head -c 200
```

Repository: ishandhanani/srt-slurm — Length of output: 90

🏁 Script executed:

```bash
# Search for other dynamo version references in the repository
rg "dynamo:" --type yaml -A 1 | head -50
```

Length of output: 3082

🏁 Script executed:

```bash
cat recipes/gb200-fp4/1k8k/low-latency.yaml | head -20
```

Length of output: 405

🏁 Script executed:

```bash
# Search for all dynamo version specifications with "version:" pattern
rg "dynamo:" -A 2 -B 0 | rg "version:|hash:|install:" | sort -u
```

Length of output: 28548

**Dynamo version 0.8.1 does not exist on PyPI and must be corrected.** While the experiment name follows the standardized convention, the dynamo configuration is invalid: the pinned version should be changed to one that is actually published.
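The check the script above performs can be sketched offline: the validity of a version pin reduces to membership in the set of published releases returned by the PyPI JSON API. A minimal sketch — the `published` list below is illustrative, not the real dynamo index:

```shell
#!/bin/sh
# Offline sketch of the pin check. "published" stands in for the release
# keys of https://pypi.org/pypi/<pkg>/json; values here are illustrative,
# NOT the real dynamo release index.
published="0.6.0 0.7.0"
candidate="0.8.1"
case " $published " in
  *" $candidate "*) echo "ok: $candidate is published" ;;
  *)                echo "invalid pin: $candidate is not published" ;;
esac
```

Running the same check with the real JSON response (e.g. piping `curl` output through a JSON parser) would make the verdict authoritative.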
```diff
 frontend:
   type: dynamo
   enable_multiple_frontends: true
-  num_additional_frontends: 4
+  num_additional_frontends: 3
   nginx_container: nginx
 
 model:
-  path: "dsr1"
-  container: "lmsysorg/sglang:v0.5.5.post2"
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
 
 resources:
```
@@ -37,7 +38,6 @@ backend: | |
| NCCL_CUMEM_ENABLE: "1" | ||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||
| SGLANG_ENABLE_JIT_DEEPGEMM: "false" | ||
| SGLANG_ENABLE_FLASHINFER_GEMM: "true" | ||
|
|
||
| decode_environment: | ||
| TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800" | ||
|
|
@@ -54,12 +54,11 @@ backend: | |
| NCCL_CUMEM_ENABLE: "1" | ||
| SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" | ||
| SGLANG_ENABLE_JIT_DEEPGEMM: "false" | ||
| SGLANG_ENABLE_FLASHINFER_GEMM: "true" | ||
|
|
||
| sglang_config: | ||
| prefill: | ||
| disaggregation-mode: "prefill" | ||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||
| disaggregation-mode: "prefill" | ||
| trust-remote-code: true | ||
| disable-radix-cache: true | ||
| kv-cache-dtype: "fp8_e4m3" | ||
|
|
@@ -81,10 +80,12 @@ backend: | |
| data-parallel-size: 1 | ||
| tensor-parallel-size: 4 | ||
| expert-parallel-size: 1 | ||
| fp4-gemm-backend: "flashinfer_trtllm" | ||
| disaggregation-transfer-backend: nixl | ||
|
|
||
| decode: | ||
| disaggregation-mode: "decode" | ||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||
| disaggregation-mode: "decode" | ||
| prefill-round-robin-balance: true | ||
| trust-remote-code: true | ||
| disable-radix-cache: true | ||
|
|
@@ -103,6 +104,8 @@ backend: | |
| moe-dense-tp-size: 1 | ||
| tensor-parallel-size: 4 | ||
| expert-parallel-size: 1 | ||
| fp4-gemm-backend: "flashinfer_trtllm" | ||
| disaggregation-transfer-backend: nixl | ||
|
|
||
| benchmark: | ||
| type: "sa-bench" | ||
|
|
||
```diff
@@ -1,16 +1,17 @@
-name: "gb200-fp4-max-tpt"
+name: "gb200-fp4-1k8k-max-tpt"
 
 dynamo:
-  version: 0.7.0
+  version: 0.8.1
 
 frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
   nginx_container: nginx
 
 model:
-  path: "dsr1"
-  container: "lmsysorg/sglang:v0.5.5.post2"
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp4"
 
 resources:
```
@@ -32,7 +33,6 @@ backend: | |
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||
| SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" | ||
| MC_TE_METRIC: "true" | ||
| MC_FORCE_MNNVL: "1" | ||
| NCCL_MNNVL_ENABLE: "1" | ||
|
|
@@ -51,7 +51,6 @@ backend: | |
| SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000" | ||
| SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" | ||
| SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" | ||
| SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1" | ||
| MC_TE_METRIC: "true" | ||
| MC_FORCE_MNNVL: "1" | ||
| NCCL_MNNVL_ENABLE: "1" | ||
|
|
@@ -61,14 +60,14 @@ backend: | |
| SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1" | ||
| SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024" | ||
| SGLANG_MOE_NVFP4_DISPATCH: "1" | ||
| SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions | ||
| SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass" | ||
|
**Contributor comment on lines 61 to 63**

Potential redundancy: the `decode_environment` still contains `SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"` while the decode `sglang_config` also selects an FP4 GEMM backend via a CLI-level option. This creates two places that configure the same setting. If the environment variable is still honored by this sglang version, the two values may conflict or silently override each other, so one of them should be removed.
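One way to resolve the flagged redundancy, assuming the CLI-level flag is the setting that should win (an untested sketch, not the author's confirmed fix, and the surrounding keys are abbreviated): drop the overlapping environment variable so the backend is selected in exactly one place.

```yaml
# Hypothetical deduplicated fragment: the FP4 GEMM backend is chosen only
# via the sglang_config flag; the overlapping env var is dropped.
decode_environment:
  SGLANG_MOE_NVFP4_DISPATCH: "1"
  # SGLANG_FLASHINFER_FP4_GEMM_BACKEND removed (was "cutlass")
sglang_config:
  decode:
    fp4-gemm-backend: "flashinfer_cutlass"
```

Keeping a single source of truth also makes it harder for prefill and decode to drift apart when the backend is changed later.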
```diff
 sglang_config:
   prefill:
     # Model configuration
     served-model-name: "deepseek-ai/DeepSeek-R1"
     trust-remote-code: true
+    disaggregation-transfer-backend: nixl
 
     # KV cache and attention
     kv-cache-dtype: "fp8_e4m3"
```
@@ -108,6 +107,7 @@ backend: | |
| # Performance optimizations | ||
| disable-cuda-graph: true | ||
| enable-dp-attention: true | ||
| fp4-gemm-backend: "flashinfer_cutlass" | ||
|
|
||
| # Parallelism | ||
| tp-size: 4 | ||
|
|
@@ -118,6 +118,7 @@ backend: | |
| # Model configuration | ||
| served-model-name: "deepseek-ai/DeepSeek-R1" | ||
| trust-remote-code: true | ||
| disaggregation-transfer-backend: nixl | ||
|
|
||
| # KV cache and attention | ||
| kv-cache-dtype: "fp8_e4m3" | ||
|
|
@@ -233,6 +234,7 @@ backend: | |
| enable-dp-lm-head: true | ||
| prefill-round-robin-balance: true | ||
| enable-dp-attention: true | ||
| fp4-gemm-backend: "flashinfer_cutlass" | ||
|
|
||
| # Parallelism | ||
| tp-size: 48 | ||
|
|
@@ -243,5 +245,5 @@ benchmark: | |
| type: "sa-bench" | ||
| isl: 1024 | ||
| osl: 8192 | ||
| concurrencies: "256x512x1024x2048x8192" | ||
| concurrencies: "256x512x1024x2048" | ||
| req_rate: "inf" | ||
**Contributor comment**

🧩 Analysis chain

🌐 Web query: `sglang v0.5.8 flashinfer_trtllm backend configuration`

💡 Result: In SGLang v0.5.8, `flashinfer_trtllm` is configured as the MoE runner backend via `--moe-runner-backend` (it is not the attention backend flag); the valid options list includes `flashinfer_trtllm`. [1] The docs' CLI example is a case of "hybrid attention" (different prefill vs decode backends) while using `flashinfer_trtllm` for the MoE runner. [2] A YAML config equivalent is supported: config-file support is documented, and CLI args override YAML. [1]

Sources: Server arguments (MoE + config support) [1]; attention backend doc showing `flashinfer_trtllm` usage in an example command [2].

🌐 Web query: `sglang v0.5.8 disaggregation-transfer-backend nixl options`

💡 Result: In SGLang v0.5.8, the NIXL "options" for PD disaggregation are mainly (a) selecting NIXL as the transfer backend, plus (b) the general PD-disaggregation knobs; there are no extra NIXL-specific SGLang CLI flags in the docs.

1. Enable the NIXL transfer backend (PD disaggregation): use `--disaggregation-transfer-backend nixl` (supported values: `mooncake`, `nixl`, `ascend`, `fake`) [1] and run separate servers with `--disaggregation-mode prefill` and `--disaggregation-mode decode`. [1][2] The docs' single-node example uses exactly these flags with NIXL. [2]

2. PD-disaggregation knobs that still apply with NIXL (regardless of backend selection): [1]
- `--disaggregation-bootstrap-port` (prefill bootstrap server port)
- `--disaggregation-decode-tp` (set on the prefill server; decode TP size)
- `--disaggregation-decode-dp` (set on the prefill server; decode DP size)
- `--disaggregation-prefill-pp` (set on the decode server; prefill PP size)
- `--disaggregation-decode-enable-offload-kvcache`
- `--disaggregation-decode-enable-fake-auto`
- `--num-reserved-decode-tokens`
- `--disaggregation-decode-polling-interval`
- `--disaggregation-ib-device` exists, but its description is tied to auto-detection for the mooncake backend (not described as a NIXL control). [1]

3. Install NIXL as a dependency via pip (or build from source, e.g. if you already have UCX installed). [2]

Sources: [1] SGLang "Server Arguments" (PD disaggregation section) (docs.sglang.io); [2] SGLang "PD Disaggregation" (NIXL requirements + usage examples) (docs.sglang.io).

🌐 Web query: `sglang 0.5.8 release notes backend options`

💡 Result: SGLang v0.5.8 was released Jan 23, 2026 (PyPI upload date). [1] Backend-related items in the v0.5.8 release notes include a `diffusers` backend (plus multi-LoRA inference, "SLA attention backends", a CLI warmup switch, a ComfyUI plugin, and broad perf improvements). [2] The main backend-selector knobs from the server CLI (`python -m sglang.launch_server --help`) are: [3]
- `--model-impl`: `auto` / `sglang` / `transformers` (model implementation choice)
- `--attention-backend`: attention kernel backend
- `prefill_attention_backend` and `decode_attention_backend` (experimental; override the attention backend separately for prefill vs decode)
- `--sampling-backend`: sampling kernel backend
- `--grammar-backend`: backend for grammar-guided decoding
- `--mm-attention-backend`: multimodal attention backend

Sources: PyPI sglang 0.5.8 release entry [1]; GitHub release notes for v0.5.8 [2]; SGLang "Server Arguments" docs (kernel backend flags) [3].

**Use `moe-runner-backend: flashinfer_trtllm` instead of `fp4-gemm-backend: flashinfer_trtllm`.** In SGLang v0.5.8, `flashinfer_trtllm` is a valid backend value, but it must be configured via the `moe-runner-backend` option, not `fp4-gemm-backend`; the config key `fp4-gemm-backend` does not exist in the official SGLang configuration schema. The `disaggregation-transfer-backend: nixl` setting is valid and correct.