ishandhanani · ishandhanani · Feb 10, 2026 · Feb 10, 2026
diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-5d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -135,5 +135,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "16x128x512"
+  concurrencies: "16x128"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-6d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -135,5 +135,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "32x64x256x512"
+  concurrencies: "32x64x256"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-max-tpt-dep4-1p-dep8-1d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -139,5 +139,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "512x1024"
+  concurrencies: "512"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-max-tpt-dep4-1p-dep8-2d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 

diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-1d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -89,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -126,6 +135,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360

diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-5d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -89,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -126,6 +135,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
@@ -135,5 +145,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "32"
+  concurrencies: "8"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-low-latency-dep4-2p-tep8-5d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -89,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -126,6 +135,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
@@ -135,5 +145,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "64x128"
+  concurrencies: "4x128"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml
@@ -1,14 +1,22 @@
-name: "b200-fp4-max-tpt-dep4-4p-dep8-1d"
+name: "b200-fp4-low-latency-tp4-1p-tp8-1d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 2
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
 resources:
   gpu_type: "b200"
-  prefill_nodes: 4
-  prefill_workers: 4
+  prefill_nodes: 1
+  prefill_workers: 1
   gpus_per_prefill: 4
   decode_nodes: 1
   decode_workers: 1
@@ -48,8 +56,6 @@ backend:
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
-    SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
     DYN_REQUEST_PLANE: nats
 
   sglang_config:
@@ -68,15 +74,15 @@ backend:
       max-prefill-tokens: 32768
       chunked-prefill-size: 32768
       context-length: 9600
-      max-running-requests: 1024
+      max-running-requests: 512
       disable-cuda-graph: true
 
       # Parallelism
       tensor-parallel-size: 4
-      data-parallel-size: 4
-      expert-parallel-size: 4
-      enable-dp-attention: true
-      enable-dp-lm-head: true
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      # enable-dp-attention: false
+      # enable-dp-lm-head: true
 
       # Attention
       attention-backend: "trtllm_mla"
@@ -91,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -107,29 +114,28 @@ backend:
       max-prefill-tokens: 32768
       chunked-prefill-size: 32768
       context-length: 9600
-      max-running-requests: 1024
-      cuda-graph-max-bs: 1024
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
 
       # Parallelism
       tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
-      enable-dp-attention: true
-      enable-dp-lm-head: true
+      data-parallel-size: 1
+      expert-parallel-size: 1
 
       # Attention
       attention-backend: "trtllm_mla"
       kv-cache-dtype: "fp8_e4m3"
 
       # MoE
       moe-runner-backend: "flashinfer_trtllm"
-      moe-dense-tp-size: 1
+      # moe-dense-tp-size: 1
 
       # Other flags
       stream-interval: 30
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
@@ -139,5 +145,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "4x8x16x64"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-max-tpt-dep4-7p-dep8-2d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -49,7 +57,6 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
     DYN_REQUEST_PLANE: nats
 
   sglang_config:
@@ -65,8 +72,8 @@ backend:
 
       # Memory and token limits
       mem-fraction-static: 0.85
-      max-prefill-tokens: 32768
-      chunked-prefill-size: 32768
+      max-prefill-tokens: 65536
+      chunked-prefill-size: 65536
       context-length: 9600
       max-running-requests: 1024
       disable-cuda-graph: true
@@ -91,6 +98,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
     decode:
       # Model configuration
@@ -107,7 +115,7 @@ backend:
       max-prefill-tokens: 32768
       chunked-prefill-size: 32768
       context-length: 9600
-      max-running-requests: 1024
+      max-running-requests: 2048
       cuda-graph-max-bs: 1024
 
       # Parallelism
@@ -130,6 +138,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
 health_check:
   max_attempts: 360
@@ -139,5 +148,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "1024x2048"
   req_rate: "inf"