diff --git a/.gitignore b/.gitignore
index 5a522a4d..fb5fbd86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,4 +51,8 @@ configs/*.tar.gz
 .ruff_cache/
 *.egg-info/
 
-.coverage
\ No newline at end of file
+.coverage
+
+configs/dg-*
+configs/flashinfer-cache/
+outputs/*
diff --git a/recipies/gb200-fp4/1k8k/low-latency.yaml b/recipies/gb200-fp4/1k8k/low-latency.yaml
index 119be5ca..6c2a9536 100644
--- a/recipies/gb200-fp4/1k8k/low-latency.yaml
+++ b/recipies/gb200-fp4/1k8k/low-latency.yaml
@@ -1,8 +1,16 @@
 name: "gb200-fp4-1p2d"
 
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -24,8 +32,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
@@ -43,8 +49,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
@@ -64,7 +68,7 @@ backend:
       moe-runner-backend: "flashinfer_trtllm"
       stream-interval: 10
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       mem-fraction-static: 0.95
       max-total-tokens: 8192
       chunked-prefill-size: 8192
@@ -77,7 +81,6 @@ backend:
       data-parallel-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
-      disaggregation-transfer-backend: nixl
 
     decode:
       disaggregation-mode: "decode"
@@ -92,7 +95,7 @@ backend:
       disaggregation-bootstrap-port: 30001
       stream-interval: 10
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       mem-fraction-static: 0.95
       chunked-prefill-size: 8192
       cuda-graph-max-bs: 256
@@ -100,7 +103,6 @@ backend:
       moe-dense-tp-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
-      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"
diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml
index f6ad6141..d2c46140 100644
--- a/recipies/gb200-fp4/1k8k/max-tpt.yaml
+++ b/recipies/gb200-fp4/1k8k/max-tpt.yaml
@@ -1,10 +1,16 @@
-# 4P1D, with 12 Decode Nodes. Uses single batch overlap
-
 name: "gb200-fp4-max-tpt"
 
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -56,13 +62,13 @@ backend:
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
     SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
-      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -80,7 +86,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -112,7 +118,6 @@ backend:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
-      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -130,7 +135,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -228,7 +233,6 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
-      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 48
diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml
index bd5f8a23..bf455b72 100644
--- a/recipies/gb200-fp4/1k8k/mid-curve.yaml
+++ b/recipies/gb200-fp4/1k8k/mid-curve.yaml
@@ -1,11 +1,16 @@
-# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
-# per gpu throughput
+name: "gb200-fp4-mid-curve"
 
-name: "gb200-fp4-max-tpt-2"
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -57,6 +62,7 @@ backend:
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
     SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -67,7 +73,6 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
-      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -81,7 +86,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -117,7 +122,6 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
-      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -131,7 +135,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -228,7 +232,6 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
-      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 32
diff --git a/recipies/h200/1k1k/bs128-agg-tp.yaml b/recipies/h200/1k1k/bs128-agg-tp.yaml
new file mode 100644
index 00000000..c036f948
--- /dev/null
+++ b/recipies/h200/1k1k/bs128-agg-tp.yaml
@@ -0,0 +1,59 @@
+name: "agg-tp-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    aggregated:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 10
+      max-running-requests: 512  # total across all DP ranks
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # CUDA graphs
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x16x32x64x128x256x512"
+  req_rate: "inf"
diff --git a/recipies/h200/1k1k/bs256-1p6d-dep.yaml b/recipies/h200/1k1k/bs256-1p6d-dep.yaml
new file mode 100644
index 00000000..76b58665
--- /dev/null
+++ b/recipies/h200/1k1k/bs256-1p6d-dep.yaml
@@ -0,0 +1,100 @@
+name: "bs256-1p6d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 6
+  decode_workers: 6
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8 
+      enable-dp-attention: true
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 512
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-prefill-tokens: 65536
+      chunked-prefill-size: 262144
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512x1024x2048"
+  req_rate: "inf"
+
diff --git a/recipies/h200/1k1k/bs256-1p6d-tp.yaml b/recipies/h200/1k1k/bs256-1p6d-tp.yaml
new file mode 100644
index 00000000..1214d55b
--- /dev/null
+++ b/recipies/h200/1k1k/bs256-1p6d-tp.yaml
@@ -0,0 +1,100 @@
+name: "bs256-1p6d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 6
+  decode_workers: 6
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 512
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.7
+      max-prefill-tokens: 163840
+      chunked-prefill-size: 163840
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  # concurrencies: "128x256x512"
+  concurrencies: "512x1024x2048"
+  req_rate: "inf"
+
diff --git a/recipies/h200/1k1k/low-latency-1p9d.yaml b/recipies/h200/1k1k/low-latency-1p9d.yaml
new file mode 100644
index 00000000..5e88422b
--- /dev/null
+++ b/recipies/h200/1k1k/low-latency-1p9d.yaml
@@ -0,0 +1,97 @@
+name: "low-latency-1p9d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 9
+  decode_workers: 9
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 256
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 163840
+      chunked-prefill-size: 163840
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x8x16x32x64x128x256"
+  req_rate: "inf"
diff --git a/recipies/h200/8k1k/bs128-1p1d-dep.yaml b/recipies/h200/8k1k/bs128-1p1d-dep.yaml
new file mode 100644
index 00000000..1a08a8ca
--- /dev/null
+++ b/recipies/h200/8k1k/bs128-1p1d-dep.yaml
@@ -0,0 +1,100 @@
+name: "bs128-1p1d-dep-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 1
+  decode_workers: 1
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 16
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-prefill-tokens: 163840
+      chunked-prefill-size: 163840
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8 
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.88
+      max-running-requests: 256
+      cuda-graph-max-bs: 256
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "64x128x256"
+  req_rate: "inf"
+
diff --git a/recipies/h200/8k1k/bs128-agg-tp.yaml b/recipies/h200/8k1k/bs128-agg-tp.yaml
new file mode 100644
index 00000000..9191f8a7
--- /dev/null
+++ b/recipies/h200/8k1k/bs128-agg-tp.yaml
@@ -0,0 +1,60 @@
+name: "agg-tp-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    aggregated:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 10
+      max-running-requests: 256  # total across all DP ranks
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # CUDA graphs
+      cuda-graph-max-bs: 256
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x4x16x32x64x128x256"
+  req_rate: "inf"
+
diff --git a/recipies/h200/8k1k/bs16-1p3d.yaml b/recipies/h200/8k1k/bs16-1p3d.yaml
new file mode 100644
index 00000000..95f756dd
--- /dev/null
+++ b/recipies/h200/8k1k/bs16-1p3d.yaml
@@ -0,0 +1,98 @@
+name: "bs16-1p3d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 3
+  decode_workers: 3
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 16
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 32
+      cuda-graph-max-bs: 32
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "8x16x32"
+  req_rate: "inf"
+
diff --git a/recipies/h200/8k1k/bs4-1p7d.yaml b/recipies/h200/8k1k/bs4-1p7d.yaml
new file mode 100644
index 00000000..75fe19f9
--- /dev/null
+++ b/recipies/h200/8k1k/bs4-1p7d.yaml
@@ -0,0 +1,98 @@
+name: "bs4-1p7d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 7
+  decode_workers: 7
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 16
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 8
+      cuda-graph-max-bs: 8
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "1x4x8"
+  req_rate: "inf"
+
diff --git a/recipies/h200/8k1k/bs64-2p3d.yaml b/recipies/h200/8k1k/bs64-2p3d.yaml
new file mode 100644
index 00000000..23b1a3d9
--- /dev/null
+++ b/recipies/h200/8k1k/bs64-2p3d.yaml
@@ -0,0 +1,106 @@
+name: "bs64-2p3d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 2
+  prefill_workers: 2
+  decode_nodes: 3
+  decode_workers: 3
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 16
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      context-length: 72000
+      max-total-tokens: 128000
+      mem-fraction-static: 0.82
+      max-running-requests: 128
+      cuda-graph-max-bs: 128
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "32x64x128"
+  req_rate: "inf"
+
+# benchmark:
+#   type: "gpqa"
+#   num_examples: 198
+#   repeat: 4
+#   num_threads: 32
+#   max_tokens: 64000
\ No newline at end of file
diff --git a/recipies/h200/8k1k/bs8-1p6d.yaml b/recipies/h200/8k1k/bs8-1p6d.yaml
new file mode 100644
index 00000000..8b18c479
--- /dev/null
+++ b/recipies/h200/8k1k/bs8-1p6d.yaml
@@ -0,0 +1,99 @@
+name: "bs8-1p6d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 6
+  decode_workers: 6
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1 
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 16
+      
+
+      # Prefill-specific mode
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+      ep-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disagg
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 16
+      cuda-graph-max-bs: 16
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: "4x8x16"
+  req_rate: "inf"
+
diff --git a/src/srtctl/backends/trtllm.py b/src/srtctl/backends/trtllm.py
index 25f1673c..2553860f 100644
--- a/src/srtctl/backends/trtllm.py
+++ b/src/srtctl/backends/trtllm.py
@@ -16,6 +16,7 @@
 # Type alias for worker modes
 WorkerMode = Literal["prefill", "decode", "agg"]
 
+
 @dataclass(frozen=True)
 class TRTLLMServerConfig:
     """SGLang server CLI configuration per mode (prefill/decode/aggregated).
@@ -30,6 +31,7 @@ class TRTLLMServerConfig:
 
     Schema: ClassVar[type[Schema]] = Schema
 
+
 @dataclass(frozen=True)
 class TRTLLMProtocol:
     """TRTLLM protocol - implements BackendProtocol.
@@ -169,7 +171,7 @@ def build_worker_command(
             "--extra-engine-args",
             str(container_config_path),
             "--request-plane",
-            "nats"
+            "nats",
         ]
 
         return cmd
diff --git a/src/srtctl/cli/mixins/frontend_stage.py b/src/srtctl/cli/mixins/frontend_stage.py
index 17dd33a2..72111c6b 100644
--- a/src/srtctl/cli/mixins/frontend_stage.py
+++ b/src/srtctl/cli/mixins/frontend_stage.py
@@ -146,6 +146,9 @@ def _start_nginx(self, topology: FrontendTopology) -> ManagedProcess:
             container_image=str(self.runtime.container_image),
             container_mounts=self.runtime.container_mounts,
             use_bash_wrapper=False,  # Already wrapped in bash -c
+            srun_options={
+                "container-remap-root": "",
+            },
         )
 
         return ManagedProcess(
diff --git a/src/srtctl/cli/mixins/worker_stage.py b/src/srtctl/cli/mixins/worker_stage.py
index bbae37ae..fc4347e9 100644
--- a/src/srtctl/cli/mixins/worker_stage.py
+++ b/src/srtctl/cli/mixins/worker_stage.py
@@ -137,7 +137,6 @@ def __missing__(self, key: str) -> str:
             formatted_value = value.format_map(SafeDict(template_vars))
             env_to_set[key] = formatted_value
 
-
         # Add profiling environment variables
         if profiling.enabled:
             profile_dir = str(self.runtime.log_dir / "profiles")
diff --git a/src/srtctl/core/schema.py b/src/srtctl/core/schema.py
index eabdc5c6..7e338f0c 100644
--- a/src/srtctl/core/schema.py
+++ b/src/srtctl/core/schema.py
@@ -585,7 +585,7 @@ def get_install_commands(self) -> str:
         if self.version is not None:
             return (
                 f"echo 'Installing dynamo {self.version}...' && "
-                f"pip install --quiet ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && "
+                f"pip install --break-system-packages --quiet ai-dynamo-runtime=={self.version} ai-dynamo=={self.version} && "
                 f"echo 'Dynamo {self.version} installed'"
             )
 
@@ -600,7 +600,7 @@ def get_install_commands(self) -> str:
             "cd dynamo && "
             f"{checkout_cmd + ' && ' if checkout_cmd else ''}"
             "cd lib/bindings/python/ && "
-            "export RUSTFLAGS=\"${RUSTFLAGS:-} -C target-cpu=native\" && "
+            'export RUSTFLAGS="${RUSTFLAGS:-} -C target-cpu=native" && '
             "maturin build -o /tmp && "
             "pip install /tmp/ai_dynamo_runtime*.whl && "
             "cd /sgl-workspace/dynamo/ && "