diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
index ab71d0a4..0f219d07 100644
--- a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
+++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-5d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -135,5 +135,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "16x128x512"
+  concurrencies: "16x128"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
index 2ac9fbeb..55347d69 100644
--- a/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
+++ b/recipes/b200-fp4/1k1k/stp/low-latency-dep4-1p-tep8-6d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-6d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -135,5 +135,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "32x64x256x512"
+  concurrencies: "32x64x256"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml
index 077bc0d9..7e617cb2 100644
--- a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml
+++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-1d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-max-tpt-dep4-1p-dep8-1d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -139,5 +139,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "512x1024"
+  concurrencies: "512"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml
index 890e35ed..51051ce4 100644
--- a/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml
+++ b/recipes/b200-fp4/1k1k/stp/max-tpt-dep4-1p-dep8-2d.yaml
@@ -1,7 +1,7 @@
 name: "b200-fp4-max-tpt-dep4-1p-dep8-2d"
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml
index 12f9adab..03a930d5 100644
--- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-1d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-1d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -89,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -126,6 +135,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
index 5cd343fe..ca4684d7 100644
--- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-1p-tep8-5d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-low-latency-dep4-1p-tep8-5d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -89,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -126,6 +135,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
@@ -135,5 +145,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "32"
+  concurrencies: "8"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml
index 502cc023..450fbcba 100644
--- a/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-dep4-2p-tep8-5d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-low-latency-dep4-2p-tep8-5d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -89,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -126,6 +135,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
@@ -135,5 +145,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "64x128"
+  concurrencies: "4x128"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml
similarity index 83%
rename from recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml
rename to recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml
index 7331d670..f1e3c39f 100644
--- a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-4p-dep8-1d.yaml
+++ b/recipes/b200-fp4/8k1k/stp/low-latency-tp4-1p-tp8-1d.yaml
@@ -1,14 +1,22 @@
-name: "b200-fp4-max-tpt-dep4-4p-dep8-1d"
+name: "b200-fp4-low-latency-tp4-1p-tp8-1d"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 2
 
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
 resources:
   gpu_type: "b200"
-  prefill_nodes: 4
-  prefill_workers: 4
+  prefill_nodes: 1
+  prefill_workers: 1
   gpus_per_prefill: 4
   decode_nodes: 1
   decode_workers: 1
@@ -48,8 +56,6 @@ backend:
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
-    SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
     DYN_REQUEST_PLANE: nats
 
   sglang_config:
@@ -68,15 +74,15 @@ backend:
       max-prefill-tokens: 32768
       chunked-prefill-size: 32768
       context-length: 9600
-      max-running-requests: 1024
+      max-running-requests: 512
       disable-cuda-graph: true
 
       # Parallelism
       tensor-parallel-size: 4
-      data-parallel-size: 4
-      expert-parallel-size: 4
-      enable-dp-attention: true
-      enable-dp-lm-head: true
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      # enable-dp-attention: false
+      # enable-dp-lm-head: true
 
       # Attention
       attention-backend: "trtllm_mla"
@@ -91,6 +97,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
     decode:
       # Model configuration
@@ -107,15 +114,13 @@ backend:
       max-prefill-tokens: 32768
       chunked-prefill-size: 32768
       context-length: 9600
-      max-running-requests: 1024
-      cuda-graph-max-bs: 1024
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
 
       # Parallelism
       tensor-parallel-size: 8
-      data-parallel-size: 8
-      expert-parallel-size: 8
-      enable-dp-attention: true
-      enable-dp-lm-head: true
+      data-parallel-size: 1
+      expert-parallel-size: 1
 
       # Attention
       attention-backend: "trtllm_mla"
@@ -123,13 +128,14 @@ backend:
 
       # MoE
       moe-runner-backend: "flashinfer_trtllm"
-      moe-dense-tp-size: 1
+      # moe-dense-tp-size: 1
 
       # Other flags
       stream-interval: 30
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_trtllm"
 
 health_check:
   max_attempts: 360
@@ -139,5 +145,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "4x8x16x64"
   req_rate: "inf"
diff --git a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml
index 0d645ec0..a9f0d01e 100644
--- a/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml
+++ b/recipes/b200-fp4/8k1k/stp/max-tpt-dep4-7p-dep8-2d.yaml
@@ -1,7 +1,15 @@
 name: "b200-fp4-max-tpt-dep4-7p-dep8-2d"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
-  path: "dsr1-fp4"
+  path: "dsr1"
   container: "dynamo-sglang"
   precision: "fp4"
 
@@ -49,7 +57,6 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
     DYN_REQUEST_PLANE: nats
 
   sglang_config:
@@ -65,8 +72,8 @@ backend:
 
       # Memory and token limits
       mem-fraction-static: 0.85
-      max-prefill-tokens: 32768
-      chunked-prefill-size: 32768
+      max-prefill-tokens: 65536
+      chunked-prefill-size: 65536
       context-length: 9600
       max-running-requests: 1024
       disable-cuda-graph: true
@@ -91,6 +98,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
     decode:
       # Model configuration
@@ -107,7 +115,7 @@ backend:
       max-prefill-tokens: 32768
       chunked-prefill-size: 32768
       context-length: 9600
-      max-running-requests: 1024
+      max-running-requests: 2048
       cuda-graph-max-bs: 1024
 
       # Parallelism
@@ -130,6 +138,7 @@ backend:
       watchdog-timeout: 1000000
       enable-flashinfer-allreduce-fusion: true
       disable-radix-cache: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
 health_check:
   max_attempts: 360
@@ -139,5 +148,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024"
+  concurrencies: "1024x2048"
   req_rate: "inf"