From 0c90137c10f7c36d72a8ebe1eec20e0c342e022f Mon Sep 17 00:00:00 2001
From: Kyle Liang <kylliang@nvidia.com>
Date: Mon, 2 Feb 2026 14:31:25 -0800
Subject: [PATCH 1/6] Update GB200-FP8 configs

---
 recipes/gb200-fp8/1k1k/low-latency.yaml       |  38 ++--
 .../1k1k/{max-tpt-2p1d.yaml => max-tpt.yaml}  |  25 ++-
 .../{mid-curve-3p1d.yaml => mid-curve.yaml}   |  28 ++-
 recipes/gb200-fp8/1k8k/low_latency.yaml       | 124 ++++++++++++
 recipes/gb200-fp8/1k8k/max_tpt.yaml           | 182 ++++++++++++++++++
 recipes/gb200-fp8/1k8k/mid_curve.yaml         | 173 +++++++++++++++++
 recipes/gb200-fp8/8k1k/low-latency.yaml       |  41 ++--
 recipes/gb200-fp8/8k1k/max_tpt.yaml           | 174 +++++++++++++++++
 .../{mid-curve-5p1d.yaml => mid-curve.yaml}   |  23 ++-
 9 files changed, 753 insertions(+), 55 deletions(-)
 rename recipes/gb200-fp8/1k1k/{max-tpt-2p1d.yaml => max-tpt.yaml} (90%)
 rename recipes/gb200-fp8/1k1k/{mid-curve-3p1d.yaml => mid-curve.yaml} (90%)
 create mode 100644 recipes/gb200-fp8/1k8k/low_latency.yaml
 create mode 100644 recipes/gb200-fp8/1k8k/max_tpt.yaml
 create mode 100644 recipes/gb200-fp8/1k8k/mid_curve.yaml
 create mode 100644 recipes/gb200-fp8/8k1k/max_tpt.yaml
 rename recipes/gb200-fp8/8k1k/{mid-curve-5p1d.yaml => mid-curve.yaml} (89%)

diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml
index 7ce9daf3..11151232 100644
--- a/recipes/gb200-fp8/1k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/1k1k/low-latency.yaml
@@ -1,16 +1,24 @@
-name: "gb200-fp8-1p-4d-low-latency"
+name: "gb200-fp8-1k1k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 2
 
 model:
-  path: "dsfp8"
-  container: "0.5.5.post2"
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp8"
 
 resources:
   gpu_type: "gb200"
   prefill_nodes: 1
-  decode_nodes: 4
+  decode_nodes: 1
   prefill_workers: 1
-  decode_workers: 4
+  decode_workers: 1
   gpus_per_node: 4
 
 backend:
@@ -18,9 +26,8 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "1"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
@@ -36,14 +43,13 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_ENABLE_FLASHINFER_GEMM: "1"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
@@ -72,12 +78,14 @@ backend:
       max-running-requests: 512
       load-balance-method: "round_robin"
       scheduler-recv-interval: 10
-      enable-flashinfer-allreduce-fusion: true
+      fp8-gemm-backend: "flashinfer_trtllm"
       enable-symm-mem: true
       moe-dense-tp-size: 1
       tensor-parallel-size: 4
       data-parallel-size: 1
       expert-parallel-size: 1
+
+      disaggregation-transfer-backend: nixl
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-R1"
@@ -94,19 +102,21 @@ backend:
       mem-fraction-static: 0.95
       chunked-prefill-size: 8192
       cuda-graph-max-bs: 128
-      max-running-requests: 512
+      max-running-requests: 128
       scheduler-recv-interval: 10
-      enable-flashinfer-allreduce-fusion: true
       enable-symm-mem: true
       moe-dense-tp-size: 1
       prefill-round-robin-balance: true
       tensor-parallel-size: 4
       data-parallel-size: 1
       expert-parallel-size: 1
+      fp8-gemm-backend: "flashinfer_trtllm"
+
+      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "4x8x32x64x80x96x112x128"
-  req_rate: "inf"
\ No newline at end of file
+  concurrencies: "4x8"
+  req_rate: "inf"
diff --git a/recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml
similarity index 90%
rename from recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml
rename to recipes/gb200-fp8/1k1k/max-tpt.yaml
index e1859cec..14f43d2a 100644
--- a/recipes/gb200-fp8/1k1k/max-tpt-2p1d.yaml
+++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml
@@ -1,10 +1,16 @@
-# GB200 FP8 Max Throughput Configuration
+name: "gb200-fp8-1k1k-max-tpt"
 
-name: "gb200-fp8-max-tpt"
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
-  path: "dsfp8"
-  container: "0.5.5.post2"
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp8"
 
 resources:
@@ -20,7 +26,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -37,7 +43,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
     MC_TE_METRIC: "true"
@@ -45,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -106,6 +111,8 @@ backend:
       ep-num-redundant-experts: 32
       deepep-config: "/configs/deepep_config.json"
 
+      disaggregation-transfer-backend: nixl
+
     decode:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
@@ -156,10 +163,12 @@ backend:
       cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
       cuda-graph-max-bs: 768
 
+      disaggregation-transfer-backend: nixl
+
 benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "1024x2048x4096"
+  concurrencies: "1024x2048x4096x6144"
   req_rate: "inf"
 
diff --git a/recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml
similarity index 90%
rename from recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml
rename to recipes/gb200-fp8/1k1k/mid-curve.yaml
index 36bbfb7e..5ea1a036 100644
--- a/recipes/gb200-fp8/1k1k/mid-curve-3p1d.yaml
+++ b/recipes/gb200-fp8/1k1k/mid-curve.yaml
@@ -1,10 +1,16 @@
-# GB200 FP8 Max Throughput Configuration
+name: "gb200-fp8-1k1k-mid-curve"
 
-name: "gb200-fp8-max-tpt"
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
-  path: "dsfp8"
-  container: "0.5.5.post2"
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp8"
 
 resources:
@@ -20,7 +26,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -37,7 +43,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
     MC_TE_METRIC: "true"
@@ -45,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -105,6 +110,7 @@ backend:
       enable-dp-lm-head: true
       ep-num-redundant-experts: 32
       deepep-config: "/configs/deepep_config.json"
+      disaggregation-transfer-backend: nixl
 
     decode:
       # Model configuration
@@ -113,9 +119,9 @@ backend:
       trust-remote-code: true
 
       # Parallelism
-      tp-size: 32
-      dp-size: 32
-      ep-size: 32
+      tp-size: 48
+      dp-size: 48
+      ep-size: 48
       enable-dp-attention: true
 
       # KV cache and attention
@@ -155,6 +161,8 @@ backend:
       # CUDA graphs
       cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
       cuda-graph-max-bs: 768
+      disaggregation-transfer-backend: nixl
+
 
 benchmark:
   type: "sa-bench"
diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml
new file mode 100644
index 00000000..6d3e893c
--- /dev/null
+++ b/recipes/gb200-fp8/1k8k/low_latency.yaml
@@ -0,0 +1,124 @@
+name: "gb200-fp8-1k8k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true     # Enable nginx + multiple routers
+  num_additional_frontends: 2         # Additional routers (total = 1 + this value)
+
+model:
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 2
+  decode_nodes: 2
+  prefill_workers: 1
+  decode_workers: 1
+  gpus_per_node: 4
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 10000
+      disaggregation-mode: "prefill"
+      mem-fraction-static: 0.95
+      max-total-tokens: 8192
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 512
+      load-balance-method: "round_robin"
+      scheduler-recv-interval: 10
+      enable-flashinfer-allreduce-fusion: false
+      fp8-gemm-backend: "flashinfer_trtllm"
+      enable-symm-mem: true
+      moe-dense-tp-size: 1
+      disaggregation-bootstrap-port: 30001
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 10000
+      disaggregation-mode: "decode"
+      mem-fraction-static: 0.95
+      chunked-prefill-size: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 128
+      scheduler-recv-interval: 10
+      enable-flashinfer-allreduce-fusion: false
+      enable-symm-mem: false  # NOTE(review): prefill sets true; confirm decode is meant to differ
+      moe-dense-tp-size: 1
+      disaggregation-bootstrap-port: 30001
+      prefill-round-robin-balance: true
+      tensor-parallel-size: 8
+      data-parallel-size: 1
+      expert-parallel-size: 1
+      fp8-gemm-backend: "flashinfer_trtllm"
+
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 8192
+  concurrencies: "4x8x32x64x80x96x112x128"
+  req_rate: "inf"
diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml
new file mode 100644
index 00000000..8322b8b9
--- /dev/null
+++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml
@@ -0,0 +1,182 @@
+name: "gb200-fp8-1k8k-max-tpt"
+
+extra_mount: # add this if you need to mount extra directories to the container
+  - "/lustre:/lustre"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true     # Enable nginx + multiple routers
+  num_additional_frontends: 9         # Additional routers (total = 1 + this value)
+
+model:
+  # Model alias and container image are kept identical to the other
+  # gb200-fp8 recipes (1k1k, 8k1k and the remaining 1k8k files) so every
+  # curve runs on the same sglang v0.5.8 / CUDA 13 runtime image.
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 4
+  prefill_workers: 2
+  decode_nodes: 8
+  decode_workers: 1
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+#    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+#    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8 
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 10000
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # Performance optimizations
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 10000
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graphs
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
+      cuda-graph-max-bs: 768
+
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 8192
+  concurrencies: "1024x2048x4096"
+  req_rate: "inf"
+
diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml
new file mode 100644
index 00000000..19737dc4
--- /dev/null
+++ b/recipes/gb200-fp8/1k8k/mid_curve.yaml
@@ -0,0 +1,173 @@
+name: "gb200-fp8-1k8k-mid-curve"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
+model:
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 6
+  prefill_workers: 3
+  decode_nodes: 12
+  decode_workers: 1
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8 
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 10000
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # Performance optimizations
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 48
+      dp-size: 48
+      ep-size: 48
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 10000
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graphs
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
+      cuda-graph-max-bs: 768
+      disaggregation-transfer-backend: nixl
+
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 8192
+  concurrencies: "1024x2048x4096"
+  req_rate: "inf"
+
diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml
index 52ea1d89..56587e29 100644
--- a/recipes/gb200-fp8/8k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/8k1k/low-latency.yaml
@@ -1,14 +1,22 @@
-name: "gb200-fp8-8k1k-1p-1d-low-latency"
+name: "gb200-fp8-8k1k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true     # Enable nginx + multiple routers
+  num_additional_frontends: 2         # Additional routers (total = 1 + this value)
 
 model:
-  path: "dsfp8"
-  container: "0.5.5.post2"
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp8"
 
 resources:
   gpu_type: "gb200"
-  prefill_nodes: 1
-  decode_nodes: 1
+  prefill_nodes: 2
+  decode_nodes: 2
   prefill_workers: 1
   decode_workers: 1
   gpus_per_node: 4
@@ -18,9 +26,8 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "1"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
@@ -36,14 +43,12 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "1"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
@@ -64,18 +69,20 @@ backend:
       watchdog-timeout: 1000000
       context-length: 9600 
       disaggregation-mode: "prefill"
-      mem-fraction-static: 0.95
+      mem-fraction-static: 0.8
       max-total-tokens: 32768 
       chunked-prefill-size: 24576 
       cuda-graph-max-bs: 512 
       max-running-requests: 512
       load-balance-method: "round_robin"
       scheduler-recv-interval: 10
-      enable-flashinfer-allreduce-fusion: true
       moe-dense-tp-size: 1
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
       data-parallel-size: 1
       expert-parallel-size: 1
+      fp8-gemm-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
 
     decode:
       served-model-name: "deepseek-ai/DeepSeek-R1"
@@ -88,18 +95,20 @@ backend:
       watchdog-timeout: 1000000
       context-length: 9600 
       disaggregation-mode: "decode"
-      mem-fraction-static: 0.95
+      mem-fraction-static: 0.8
       chunked-prefill-size: 8192
       cuda-graph-max-bs: 512 
       max-running-requests: 512
       scheduler-recv-interval: 10
-      enable-flashinfer-allreduce-fusion: true
       enable-symm-mem: true
       moe-dense-tp-size: 1
       prefill-round-robin-balance: true
-      tensor-parallel-size: 4
+      tensor-parallel-size: 8
       data-parallel-size: 1
       expert-parallel-size: 1
+      fp8-gemm-backend: "flashinfer_trtllm"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"
diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml
new file mode 100644
index 00000000..0d90dddb
--- /dev/null
+++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml
@@ -0,0 +1,174 @@
+name: "gb200-8k1k-fp8-max-tpt"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
+model:
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "gb200"
+  prefill_nodes: 12
+  prefill_workers: 6
+  decode_nodes: 6
+  decode_workers: 1
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8 
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 9300
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt" 
+      disaggregation-bootstrap-port: 30001
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.80
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # Performance optimizations
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+      # KV transfer backend (bootstrap port is already set under "Other flags")
+      disaggregation-transfer-backend: nixl
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 24
+      dp-size: 24
+      ep-size: 24
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 8192 
+      context-length: 9300 
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+      # CUDA graphs
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512]
+      cuda-graph-max-bs: 512
+      # KV transfer backend (bootstrap port is already set under "Other flags")
+      disaggregation-transfer-backend: nixl
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192 
+  osl: 1024
+  concurrencies: "2048x4096x6144x8192x10240"
+  req_rate: "300"
diff --git a/recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml
similarity index 89%
rename from recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml
rename to recipes/gb200-fp8/8k1k/mid-curve.yaml
index 4c6fff6d..24d12de1 100644
--- a/recipes/gb200-fp8/8k1k/mid-curve-5p1d.yaml
+++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml
@@ -1,10 +1,16 @@
-# GB200 FP8 Mid curve Configuration
-
 name: "gb200-8k1k-fp8-mid-tpt"
 
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
 model:
-  path: "dsfp8"
-  container: "0.5.5.post2"
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp8"
 
 resources:
@@ -20,7 +26,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -37,7 +43,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256"
     MC_TE_METRIC: "true"
@@ -45,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -105,6 +110,8 @@ backend:
       enable-dp-lm-head: true
       ep-num-redundant-experts: 32
       deepep-config: "/configs/deepep_config.json"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
 
     decode:
       # Model configuration
@@ -153,6 +160,8 @@ backend:
       deepep-config: "/configs/deepep_config.json"
       # CUDA graphs
       cuda-graph-max-bs: 256 
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"

From acb398b5dfc325677808cae17cc2fec3ba80262f Mon Sep 17 00:00:00 2001
From: Kyle Liang <kylliang@nvidia.com>
Date: Mon, 2 Feb 2026 14:44:51 -0800
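Subject: [PATCH 2/6] Update GB200-FP4 configs

Update the nine GB200-FP4 recipes (1k1k, 1k8k and 8k1k, each with
low-latency, mid-curve and max-tpt profiles):

- Rename each recipe to "gb200-fp4-<workload>-<profile>", e.g.
  "gb200-fp4-1k8k-mid-curve".
- Pin dynamo 0.8.1 and configure multiple dynamo frontends.
- Use the lmsysorg/sglang:v0.5.8-cu130-runtime container; the 1k8k
  recipes also switch the model path from "dsr1" to "dsfp4".
- Drop the SGLANG_HACK_SEQ_BOOTSTRAP_ROOM,
  SGLANG_ENABLE_FLASHINFER_GEMM and SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH
  environment variables.
- Add the fp4-gemm-backend ("flashinfer_trtllm" for low-latency,
  "flashinfer_cutlass" for mid-curve and max-tpt) and
  disaggregation-transfer-backend: nixl server arguments.
- Trim the 8k1k max-tpt concurrency sweep to "1024x2048".

---
Review note: recipes/gb200-fp4/1k8k/max-tpt.yaml still sets
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass", which this patch removes
from the other mid-curve and max-tpt recipes; confirm that is intended.
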
 recipes/gb200-fp4/1k1k/low-latency.yaml | 24 +++++++++++++++---------
 recipes/gb200-fp4/1k1k/max-tpt.yaml     | 22 ++++++++++++++--------
 recipes/gb200-fp4/1k1k/mid-curve.yaml   | 23 ++++++++++++++---------
 recipes/gb200-fp4/1k8k/low-latency.yaml | 24 +++++++++++++-----------
 recipes/gb200-fp4/1k8k/max-tpt.yaml     | 15 ++++++++-------
 recipes/gb200-fp4/1k8k/mid-curve.yaml   | 20 ++++++++++----------
 recipes/gb200-fp4/8k1k/low-latency.yaml | 23 +++++++++++++++--------
 recipes/gb200-fp4/8k1k/max-tpt.yaml     | 22 +++++++++++++++-------
 recipes/gb200-fp4/8k1k/mid-curve.yaml   | 22 +++++++++++++++-------
 9 files changed, 119 insertions(+), 76 deletions(-)

diff --git a/recipes/gb200-fp4/1k1k/low-latency.yaml b/recipes/gb200-fp4/1k1k/low-latency.yaml
index b27f67ec..bdb3cb6f 100644
--- a/recipes/gb200-fp4/1k1k/low-latency.yaml
+++ b/recipes/gb200-fp4/1k1k/low-latency.yaml
@@ -1,8 +1,16 @@
-name: "gb200-fp4-1p2d"
+name: "gb200-fp4-1k1k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 3
 
 model:
   path: "dsfp4"
-  container: "0.5.5.post2"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -25,14 +33,11 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "true" 
 
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
@@ -44,14 +49,11 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "true" 
 
   sglang_config:
     prefill:
@@ -76,6 +78,8 @@ backend:
       moe-dense-tp-size: 1
       load-balance-method: "round_robin"
       disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
+      fp4-gemm-backend: "flashinfer_trtllm"
       data-parallel-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
@@ -100,6 +104,8 @@ backend:
       scheduler-recv-interval: 10
       enable-symm-mem: true
       moe-dense-tp-size: 1
+      disaggregation-transfer-backend: nixl
+      fp4-gemm-backend: "flashinfer_trtllm"
       tensor-parallel-size: 4
       expert-parallel-size: 1
 
@@ -108,4 +114,4 @@ benchmark:
   isl: 1024
   osl: 1024
   concurrencies: "4x8x32x64x112x128x256"
-  req_rate: "inf"
\ No newline at end of file
+  req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k1k/max-tpt.yaml b/recipes/gb200-fp4/1k1k/max-tpt.yaml
index ba31ccfc..2cf26114 100644
--- a/recipes/gb200-fp4/1k1k/max-tpt.yaml
+++ b/recipes/gb200-fp4/1k1k/max-tpt.yaml
@@ -1,10 +1,16 @@
-# 4P1D, with 12 Decode Nodes. Uses single batch overlap
+name: "gb200-fp4-1k1k-max-tpt"
 
-name: "gb200-fp4-max-tpt"
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
   path: "dsfp4"
-  container: "0.5.5.post2"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -27,7 +33,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -46,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -56,8 +60,6 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -103,6 +105,8 @@ backend:
       # Performance optimizations
       disable-cuda-graph: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
 
       # Parallelism
       tp-size: 4
@@ -162,6 +166,8 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
+      disaggregation-transfer-backend: nixl
+      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 48
@@ -173,4 +179,4 @@ benchmark:
   isl: 1024
   osl: 1024
   concurrencies: "1x128x512x2048x4096x8192x12000x15000"
-  req_rate: "inf"
\ No newline at end of file
+  req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k1k/mid-curve.yaml b/recipes/gb200-fp4/1k1k/mid-curve.yaml
index 2365f2c0..277cc2c4 100644
--- a/recipes/gb200-fp4/1k1k/mid-curve.yaml
+++ b/recipes/gb200-fp4/1k1k/mid-curve.yaml
@@ -1,11 +1,16 @@
-# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
-# per gpu throughput
+name: "gb200-fp4-1k1k-mid-curve"
 
-name: "gb200-fp4-max-tpt-2"
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
   path: "dsfp4"
-  container: "0.5.5.post2"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -28,7 +33,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -47,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -57,8 +60,6 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -104,6 +105,8 @@ backend:
       # Performance optimizations
       disable-cuda-graph: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
 
       # Parallelism
       tp-size: 4
@@ -162,6 +165,8 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
+      disaggregation-transfer-backend: nixl
+      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 32
@@ -173,4 +178,4 @@ benchmark:
   isl: 1024
   osl: 1024
   concurrencies: "1x128x512x2048x4096x8192x12000x15000"
-  req_rate: "inf"
\ No newline at end of file
+  req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k8k/low-latency.yaml b/recipes/gb200-fp4/1k8k/low-latency.yaml
index 6c2a9536..10944923 100644
--- a/recipes/gb200-fp4/1k8k/low-latency.yaml
+++ b/recipes/gb200-fp4/1k8k/low-latency.yaml
@@ -1,16 +1,16 @@
-name: "gb200-fp4-1p2d"
+name: "gb200-fp4-1k8k-low-latency"
 
 dynamo:
-  version: 0.7.0
+  version: 0.8.1
 
-frontend:
-  type: dynamo
+frontend:  
+  type: dynamo  
   enable_multiple_frontends: true
-  num_additional_frontends: 4
+  num_additional_frontends: 3
 
 model:
-  path: "dsr1"
-  container: "lmsysorg/sglang:v0.5.5.post2"
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -37,7 +37,6 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "true"
 
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
@@ -54,12 +53,11 @@ backend:
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "true"
 
   sglang_config:
     prefill:
-      disaggregation-mode: "prefill"
       served-model-name: "deepseek-ai/DeepSeek-R1"
+      disaggregation-mode: "prefill"
       trust-remote-code: true
       disable-radix-cache: true
       kv-cache-dtype: "fp8_e4m3"
@@ -81,10 +79,12 @@ backend:
       data-parallel-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
 
     decode:
-      disaggregation-mode: "decode"
       served-model-name: "deepseek-ai/DeepSeek-R1"
+      disaggregation-mode: "decode"
       prefill-round-robin-balance: true
       trust-remote-code: true
       disable-radix-cache: true
@@ -103,6 +103,8 @@ backend:
       moe-dense-tp-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
+      fp4-gemm-backend: "flashinfer_trtllm"
+      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"
diff --git a/recipes/gb200-fp4/1k8k/max-tpt.yaml b/recipes/gb200-fp4/1k8k/max-tpt.yaml
index d2c46140..68bd1928 100644
--- a/recipes/gb200-fp4/1k8k/max-tpt.yaml
+++ b/recipes/gb200-fp4/1k8k/max-tpt.yaml
@@ -1,7 +1,7 @@
-name: "gb200-fp4-max-tpt"
+name: "gb200-fp4-1k8k-max-tpt"
 
 dynamo:
-  version: 0.7.0
+  version: 0.8.1
 
 frontend:
   type: dynamo
@@ -9,8 +9,8 @@ frontend:
   num_additional_frontends: 9
 
 model:
-  path: "dsr1"
-  container: "lmsysorg/sglang:v0.5.5.post2"
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -32,7 +32,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -51,7 +50,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -61,7 +59,6 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
     SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
@@ -69,6 +66,7 @@ backend:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
+      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -108,6 +106,7 @@ backend:
       # Performance optimizations
       disable-cuda-graph: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 4
@@ -118,6 +117,7 @@ backend:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
+      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -233,6 +233,7 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 48
diff --git a/recipes/gb200-fp4/1k8k/mid-curve.yaml b/recipes/gb200-fp4/1k8k/mid-curve.yaml
index bf455b72..c781fc7f 100644
--- a/recipes/gb200-fp4/1k8k/mid-curve.yaml
+++ b/recipes/gb200-fp4/1k8k/mid-curve.yaml
@@ -1,16 +1,16 @@
-name: "gb200-fp4-mid-curve"
+name: "gb200-fp4-1k8k-mid-curve"
 
 dynamo:
-  version: 0.7.0
+  version: 0.8.1
 
-frontend:
-  type: dynamo
+frontend:  
+  type: dynamo  
   enable_multiple_frontends: true
   num_additional_frontends: 9
 
 model:
-  path: "dsr1"
-  container: "lmsysorg/sglang:v0.5.5.post2"
+  path: "dsfp4"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -32,7 +32,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -51,7 +50,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -61,8 +59,6 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -73,6 +69,7 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
+      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -108,6 +105,7 @@ backend:
       # Performance optimizations
       disable-cuda-graph: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 4
@@ -122,6 +120,7 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
+      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -232,6 +231,7 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 32
diff --git a/recipes/gb200-fp4/8k1k/low-latency.yaml b/recipes/gb200-fp4/8k1k/low-latency.yaml
index 73a88588..1fab3df1 100644
--- a/recipes/gb200-fp4/8k1k/low-latency.yaml
+++ b/recipes/gb200-fp4/8k1k/low-latency.yaml
@@ -1,8 +1,16 @@
-name: "gb200-8k1k-fp4-low-latency-1p_tp4/4d_tp4"
+name: "gb200-fp4-8k1k-low-latency"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
 
 model:
   path: "dsfp4"
-  container: "0.5.5.post2"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -25,14 +33,11 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "true" 
 
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
@@ -44,14 +49,11 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
     SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
-    SGLANG_ENABLE_FLASHINFER_GEMM: "true" 
 
   sglang_config:
     prefill:
@@ -77,6 +79,8 @@ backend:
       load-balance-method: "round_robin"
       disaggregation-bootstrap-port: 30001
       data-parallel-size: 1
+      disaggregation-transfer-backend: nixl
+      fp4-gemm-backend: "flashinfer_trtllm"
       tensor-parallel-size: 4
       expert-parallel-size: 1
       enable-dp-attention: false
@@ -101,9 +105,12 @@ backend:
       scheduler-recv-interval: 10
       enable-symm-mem: true
       moe-dense-tp-size: 1
+      disaggregation-transfer-backend: nixl
+      fp4-gemm-backend: "flashinfer_trtllm"
       tensor-parallel-size: 4
       expert-parallel-size: 1
       enable-dp-attention: false
+
 benchmark:
   type: "sa-bench"
   isl: 8192 
diff --git a/recipes/gb200-fp4/8k1k/max-tpt.yaml b/recipes/gb200-fp4/8k1k/max-tpt.yaml
index 26b4629d..b54813ab 100644
--- a/recipes/gb200-fp4/8k1k/max-tpt.yaml
+++ b/recipes/gb200-fp4/8k1k/max-tpt.yaml
@@ -1,8 +1,16 @@
-name: "gb200-8k1k-fp4-max-10p_tp4/1d_dep32"
+name: "gb200-fp4-8k1k-max-tpt"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
   path: "dsfp4"
-  container: "0.5.5.post2"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -25,7 +33,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -44,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -54,8 +60,6 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -99,6 +103,8 @@ backend:
       # Performance optimizations
       disable-cuda-graph: true
       enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
 
       # Parallelism
       tp-size: 4
@@ -156,6 +162,8 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
 
       # Parallelism
       tp-size: 32
@@ -166,5 +174,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024x2048x8192"
+  concurrencies: "1024x2048"
   req_rate: 700
diff --git a/recipes/gb200-fp4/8k1k/mid-curve.yaml b/recipes/gb200-fp4/8k1k/mid-curve.yaml
index f1e9bb41..95b33f54 100644
--- a/recipes/gb200-fp4/8k1k/mid-curve.yaml
+++ b/recipes/gb200-fp4/8k1k/mid-curve.yaml
@@ -1,8 +1,16 @@
-name: "gb200-8k1k-fp4-mid-6p_tp4/1d_dep48"
+name: "gb200-fp4-8k1k-mid-curve"
+
+dynamo:
+  version: 0.8.1
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
   path: "dsfp4"
-  container: "0.5.5.post2"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp4"
 
 resources:
@@ -25,7 +33,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -44,7 +51,6 @@ backend:
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
-    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
     MC_TE_METRIC: "true"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
@@ -54,8 +60,6 @@ backend:
     SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
-    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
-    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -99,6 +103,8 @@ backend:
       # Performance optimizations
       disable-cuda-graph: true
       enable-dp-attention: false
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
 
       # Parallelism
       tp-size: 4
@@ -156,6 +162,8 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
+      fp4-gemm-backend: "flashinfer_cutlass"
+      disaggregation-transfer-backend: nixl
 
       # Parallelism
       tp-size: 48
@@ -167,4 +175,4 @@ benchmark:
   isl: 8192
   osl: 1024
   concurrencies: "512x1024x2048x4096"
-  req_rate: 700
\ No newline at end of file
+  req_rate: 700

From 6fa4eb6fb02dc3cc74de848cdf41b3d42df80519 Mon Sep 17 00:00:00 2001
From: Kyle Liang <kylliang@nvidia.com>
Date: Mon, 2 Feb 2026 16:31:14 -0800
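Subject: [PATCH 3/6] Add nginx container to all GB200-FP8 configs

Set `nginx_container: nginx` in the frontend section of every GB200-FP8
recipe.

Also:
- 1k8k/max_tpt.yaml: drop the /lustre extra_mount and switch to the
  "dsr1-fp8" model path and the lmsysorg/sglang:v0.5.8-cu130-runtime
  container.
- Remove the truncated inline comments on the frontend keys.
- Adjust the sa-bench concurrency sweeps in the 1k8k and 8k1k recipes.

---
Review note: recipes/gb200-fp8/1k8k/max_tpt.yaml is still named
"gb200-fp8-1k1k-max-tpt" although it benchmarks isl 1024 / osl 8192;
rename it to "gb200-fp8-1k8k-max-tpt" unless a later patch in this
series already does so.
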
 recipes/gb200-fp8/1k1k/low-latency.yaml |  1 +
 recipes/gb200-fp8/1k1k/max-tpt.yaml     |  1 +
 recipes/gb200-fp8/1k1k/mid-curve.yaml   |  1 +
 recipes/gb200-fp8/1k8k/low_latency.yaml |  7 ++++---
 recipes/gb200-fp8/1k8k/max_tpt.yaml     | 17 ++++++-----------
 recipes/gb200-fp8/1k8k/mid_curve.yaml   |  3 ++-
 recipes/gb200-fp8/8k1k/low-latency.yaml |  7 ++++---
 recipes/gb200-fp8/8k1k/max_tpt.yaml     |  3 ++-
 recipes/gb200-fp8/8k1k/mid-curve.yaml   |  1 +
 9 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml
index 11151232..92cb098d 100644
--- a/recipes/gb200-fp8/1k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/1k1k/low-latency.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 2
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
diff --git a/recipes/gb200-fp8/1k1k/max-tpt.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml
index 14f43d2a..e20bfd84 100644
--- a/recipes/gb200-fp8/1k1k/max-tpt.yaml
+++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
diff --git a/recipes/gb200-fp8/1k1k/mid-curve.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml
index 5ea1a036..95eae698 100644
--- a/recipes/gb200-fp8/1k1k/mid-curve.yaml
+++ b/recipes/gb200-fp8/1k1k/mid-curve.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml
index 6d3e893c..73eda59f 100644
--- a/recipes/gb200-fp8/1k8k/low_latency.yaml
+++ b/recipes/gb200-fp8/1k8k/low_latency.yaml
@@ -5,8 +5,9 @@ dynamo:
 
 frontend:  
   type: dynamo  
-  enable_multiple_frontends: true     # Enable nginx + multiple routers  
-  num_additional_frontends: 2         # Additional routers (total = 1 + t
+  enable_multiple_frontends: true
+  num_additional_frontends: 2
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
@@ -120,5 +121,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 8192
-  concurrencies: "4x8x32x64x80x96x112x128"
+  concurrencies: "4x8"
   req_rate: "inf"
diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml
index 8322b8b9..749b1bb8 100644
--- a/recipes/gb200-fp8/1k8k/max_tpt.yaml
+++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml
@@ -1,22 +1,17 @@
 name: "gb200-fp8-1k1k-max-tpt"
 
-extra_mount: # add this if you need to mount extra directories to the container
-  - "/lustre:/lustre"
-
 dynamo:
   version: 0.8.1
 
 frontend:
   type: dynamo
-  enable_multiple_frontends: true     # Enable nginx + multiple routers  
-  num_additional_frontends: 9         # Additional routers (total = 1 + t
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
-  path: "dsfp8"
-#  container: "sglang0p5p5ppost2"
-#  container: "sglang0p5p7"
-#  container: "sglang0p5p8"
-  container: "sglang0p5p8_cu13"
+  path: "dsr1-fp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
   precision: "fp8"
 
 resources:
@@ -177,6 +172,6 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 8192
-  concurrencies: "1024x2048x4096"
+  concurrencies: "1024x2048x4096x6144"
   req_rate: "inf"
 
diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml
index 19737dc4..452b033b 100644
--- a/recipes/gb200-fp8/1k8k/mid_curve.yaml
+++ b/recipes/gb200-fp8/1k8k/mid_curve.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
@@ -168,6 +169,6 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 8192
-  concurrencies: "1024x2048x4096"
+  concurrencies: "1024x2048x4096x6144"
   req_rate: "inf"
 
diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml
index 56587e29..c73c7a8f 100644
--- a/recipes/gb200-fp8/8k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/8k1k/low-latency.yaml
@@ -5,8 +5,9 @@ dynamo:
 
 frontend:
   type: dynamo
-  enable_multiple_frontends: true     # Enable nginx + multiple routers  
-  num_additional_frontends: 2         # Additional routers (total = 1 + t
+  enable_multiple_frontends: true
+  num_additional_frontends: 2
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
@@ -114,5 +115,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192 
   osl: 1024
-  concurrencies: "4x8x16x32"
+  concurrencies: "4x8x16"
   req_rate: "inf"
diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml
index 0d90dddb..b48de751 100644
--- a/recipes/gb200-fp8/8k1k/max_tpt.yaml
+++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"
@@ -170,5 +171,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192 
   osl: 1024
-  concurrencies: "2048x4096x6144x8192x10240"
+  concurrencies: "2048x4096x6144"
   req_rate: "300"
diff --git a/recipes/gb200-fp8/8k1k/mid-curve.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml
index 24d12de1..bd0fb9ef 100644
--- a/recipes/gb200-fp8/8k1k/mid-curve.yaml
+++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsr1-fp8"

From 55e80e9b41372fe192a32b5454d8ed3bdfd1e42f Mon Sep 17 00:00:00 2001
From: Kyle Liang <kylliang@nvidia.com>
Date: Mon, 2 Feb 2026 16:45:48 -0800
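Subject: [PATCH 4/6] Add nginx container to GB200-FP4 configs

Set `nginx_container: nginx` in the frontend section of every GB200-FP4
recipe, and narrow the sa-bench concurrency sweeps in several of them.

---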
 recipes/gb200-fp4/1k1k/low-latency.yaml | 3 ++-
 recipes/gb200-fp4/1k1k/max-tpt.yaml     | 3 ++-
 recipes/gb200-fp4/1k1k/mid-curve.yaml   | 3 ++-
 recipes/gb200-fp4/1k8k/low-latency.yaml | 1 +
 recipes/gb200-fp4/1k8k/max-tpt.yaml     | 3 ++-
 recipes/gb200-fp4/1k8k/mid-curve.yaml   | 1 +
 recipes/gb200-fp4/8k1k/low-latency.yaml | 3 ++-
 recipes/gb200-fp4/8k1k/max-tpt.yaml     | 3 ++-
 recipes/gb200-fp4/8k1k/mid-curve.yaml   | 3 ++-
 9 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/recipes/gb200-fp4/1k1k/low-latency.yaml b/recipes/gb200-fp4/1k1k/low-latency.yaml
index bdb3cb6f..c953d991 100644
--- a/recipes/gb200-fp4/1k1k/low-latency.yaml
+++ b/recipes/gb200-fp4/1k1k/low-latency.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 3
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -113,5 +114,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "4x8x32x64x112x128x256"
+  concurrencies: "4x8x32"
   req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k1k/max-tpt.yaml b/recipes/gb200-fp4/1k1k/max-tpt.yaml
index 2cf26114..4bfd6ccf 100644
--- a/recipes/gb200-fp4/1k1k/max-tpt.yaml
+++ b/recipes/gb200-fp4/1k1k/max-tpt.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -178,5 +179,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "1x128x512x2048x4096x8192x12000x15000"
+  concurrencies: "512x2048x4096"
   req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k1k/mid-curve.yaml b/recipes/gb200-fp4/1k1k/mid-curve.yaml
index 277cc2c4..d87d9a5a 100644
--- a/recipes/gb200-fp4/1k1k/mid-curve.yaml
+++ b/recipes/gb200-fp4/1k1k/mid-curve.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -177,5 +178,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 1024
-  concurrencies: "1x128x512x2048x4096x8192x12000x15000"
+  concurrencies: "512x2048x4096x8192x15000"
   req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k8k/low-latency.yaml b/recipes/gb200-fp4/1k8k/low-latency.yaml
index 10944923..1153983f 100644
--- a/recipes/gb200-fp4/1k8k/low-latency.yaml
+++ b/recipes/gb200-fp4/1k8k/low-latency.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo  
   enable_multiple_frontends: true
   num_additional_frontends: 3
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
diff --git a/recipes/gb200-fp4/1k8k/max-tpt.yaml b/recipes/gb200-fp4/1k8k/max-tpt.yaml
index 68bd1928..8d75c7be 100644
--- a/recipes/gb200-fp4/1k8k/max-tpt.yaml
+++ b/recipes/gb200-fp4/1k8k/max-tpt.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -244,5 +245,5 @@ benchmark:
   type: "sa-bench"
   isl: 1024
   osl: 8192
-  concurrencies: "256x512x1024x2048x8192"
+  concurrencies: "256x512x1024x2048"
   req_rate: "inf"
diff --git a/recipes/gb200-fp4/1k8k/mid-curve.yaml b/recipes/gb200-fp4/1k8k/mid-curve.yaml
index c781fc7f..01141454 100644
--- a/recipes/gb200-fp4/1k8k/mid-curve.yaml
+++ b/recipes/gb200-fp4/1k8k/mid-curve.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo  
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
diff --git a/recipes/gb200-fp4/8k1k/low-latency.yaml b/recipes/gb200-fp4/8k1k/low-latency.yaml
index 1fab3df1..f274b863 100644
--- a/recipes/gb200-fp4/8k1k/low-latency.yaml
+++ b/recipes/gb200-fp4/8k1k/low-latency.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 4
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -115,5 +116,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192 
   osl: 1024
-  concurrencies: "4x8x32x64"
+  concurrencies: "4x8"
   req_rate: 300 
diff --git a/recipes/gb200-fp4/8k1k/max-tpt.yaml b/recipes/gb200-fp4/8k1k/max-tpt.yaml
index b54813ab..2164891a 100644
--- a/recipes/gb200-fp4/8k1k/max-tpt.yaml
+++ b/recipes/gb200-fp4/8k1k/max-tpt.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -174,5 +175,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "1024x2048"
+  concurrencies: "2048"
   req_rate: 700
diff --git a/recipes/gb200-fp4/8k1k/mid-curve.yaml b/recipes/gb200-fp4/8k1k/mid-curve.yaml
index 95b33f54..43ed73fd 100644
--- a/recipes/gb200-fp4/8k1k/mid-curve.yaml
+++ b/recipes/gb200-fp4/8k1k/mid-curve.yaml
@@ -7,6 +7,7 @@ frontend:
   type: dynamo
   enable_multiple_frontends: true
   num_additional_frontends: 9
+  nginx_container: nginx
 
 model:
   path: "dsfp4"
@@ -174,5 +175,5 @@ benchmark:
   type: "sa-bench"
   isl: 8192
   osl: 1024
-  concurrencies: "512x1024x2048x4096"
+  concurrencies: "512x2048x4096"
   req_rate: 700

From 6ff475f2197471bb30687cb00a987fac81179a59 Mon Sep 17 00:00:00 2001
From: Kyle Liang <kylliang@nvidia.com>
Date: Tue, 3 Feb 2026 13:34:16 -0800
Subject: [PATCH 5/6] Clean up configs

---
 recipes/gb200-fp8/1k1k/low-latency.yaml | 4 ++--
 recipes/gb200-fp8/8k1k/max_tpt.yaml     | 4 ----
 recipes/gb200-fp8/8k1k/mid-curve.yaml   | 2 --
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml
index 92cb098d..e0be8170 100644
--- a/recipes/gb200-fp8/1k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/1k1k/low-latency.yaml
@@ -85,7 +85,7 @@ backend:
       tensor-parallel-size: 4
       data-parallel-size: 1
       expert-parallel-size: 1
- 
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
     decode:
@@ -112,7 +112,7 @@ backend:
       data-parallel-size: 1
       expert-parallel-size: 1
       fp8-gemm-backend: "flashinfer_trtllm"
-
+      disaggregation-bootstrap-port: 30001
       disaggregation-transfer-backend: nixl
 
 benchmark:
diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml
index b48de751..2032a01f 100644
--- a/recipes/gb200-fp8/8k1k/max_tpt.yaml
+++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml
@@ -87,8 +87,6 @@ backend:
       watchdog-timeout: 1000000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
-      init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt" 
-      disaggregation-bootstrap-port: 30001
 
       # Prefill-specific mode
       disaggregation-mode: "prefill"
@@ -142,8 +140,6 @@ backend:
       watchdog-timeout: 1000000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
-      init-expert-location: "/configs/expert-distributions/expert_distribution_fp8_8k1k_compressed.pt"
-      disaggregation-bootstrap-port: 30001
 
       # Decode-specific mode
       disaggregation-mode: "decode"
diff --git a/recipes/gb200-fp8/8k1k/mid-curve.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml
index bd0fb9ef..844d464b 100644
--- a/recipes/gb200-fp8/8k1k/mid-curve.yaml
+++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml
@@ -87,7 +87,6 @@ backend:
       watchdog-timeout: 1000000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
-      disaggregation-bootstrap-port: 30001
 
       # Prefill-specific mode
       disaggregation-mode: "prefill"
@@ -141,7 +140,6 @@ backend:
       watchdog-timeout: 1000000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
-      disaggregation-bootstrap-port: 30001
 
       # Decode-specific mode
       disaggregation-mode: "decode"

From 41385b5442a95e3b206abbcad472bd2923a5541a Mon Sep 17 00:00:00 2001
From: Kyle Liang <kylliang@nvidia.com>
Date: Tue, 3 Feb 2026 17:43:48 -0800
Subject: [PATCH 6/6] Switch from DG cache dir to fast DeepGEMM JIT warmup

---
 recipes/gb200-fp8/1k1k/low-latency.yaml | 6 +++---
 recipes/gb200-fp8/1k1k/max-tpt.yaml     | 6 +++---
 recipes/gb200-fp8/1k1k/mid-curve.yaml   | 6 +++---
 recipes/gb200-fp8/1k8k/low_latency.yaml | 6 +++---
 recipes/gb200-fp8/1k8k/max_tpt.yaml     | 8 +++-----
 recipes/gb200-fp8/1k8k/mid_curve.yaml   | 6 +++---
 recipes/gb200-fp8/8k1k/low-latency.yaml | 6 +++---
 recipes/gb200-fp8/8k1k/max_tpt.yaml     | 6 +++---
 recipes/gb200-fp8/8k1k/mid-curve.yaml   | 6 +++---
 9 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/recipes/gb200-fp8/1k1k/low-latency.yaml b/recipes/gb200-fp8/1k1k/low-latency.yaml
index e0be8170..4349af7d 100644
--- a/recipes/gb200-fp8/1k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/1k1k/low-latency.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configsdg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
@@ -44,7 +44,7 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configsdg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_ENABLE_FLASHINFER_GEMM: "1"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
diff --git a/recipes/gb200-fp8/1k1k/max-tpt.yaml b/recipes/gb200-fp8/1k1k/max-tpt.yaml
index e20bfd84..2e6dfcbe 100644
--- a/recipes/gb200-fp8/1k1k/max-tpt.yaml
+++ b/recipes/gb200-fp8/1k1k/max-tpt.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -44,7 +44,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
     MC_TE_METRIC: "true"
diff --git a/recipes/gb200-fp8/1k1k/mid-curve.yaml b/recipes/gb200-fp8/1k1k/mid-curve.yaml
index 95eae698..7b59b995 100644
--- a/recipes/gb200-fp8/1k1k/mid-curve.yaml
+++ b/recipes/gb200-fp8/1k1k/mid-curve.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -44,7 +44,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
     MC_TE_METRIC: "true"
diff --git a/recipes/gb200-fp8/1k8k/low_latency.yaml b/recipes/gb200-fp8/1k8k/low_latency.yaml
index 73eda59f..a24ea169 100644
--- a/recipes/gb200-fp8/1k8k/low_latency.yaml
+++ b/recipes/gb200-fp8/1k8k/low_latency.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
@@ -44,7 +44,7 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
diff --git a/recipes/gb200-fp8/1k8k/max_tpt.yaml b/recipes/gb200-fp8/1k8k/max_tpt.yaml
index 749b1bb8..904acf89 100644
--- a/recipes/gb200-fp8/1k8k/max_tpt.yaml
+++ b/recipes/gb200-fp8/1k8k/max_tpt.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,8 +27,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
-#    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -45,8 +44,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
-#    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.5.post2"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
     MC_TE_METRIC: "true"
diff --git a/recipes/gb200-fp8/1k8k/mid_curve.yaml b/recipes/gb200-fp8/1k8k/mid_curve.yaml
index 452b033b..5c894a80 100644
--- a/recipes/gb200-fp8/1k8k/mid_curve.yaml
+++ b/recipes/gb200-fp8/1k8k/mid_curve.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -44,7 +44,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
     MC_TE_METRIC: "true"
diff --git a/recipes/gb200-fp8/8k1k/low-latency.yaml b/recipes/gb200-fp8/8k1k/low-latency.yaml
index c73c7a8f..83a12e2e 100644
--- a/recipes/gb200-fp8/8k1k/low-latency.yaml
+++ b/recipes/gb200-fp8/8k1k/low-latency.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
@@ -44,7 +44,7 @@ backend:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
     PYTHONUNBUFFERED: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     SGLANG_ENABLE_JIT_DEEPGEMM: "false"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
diff --git a/recipes/gb200-fp8/8k1k/max_tpt.yaml b/recipes/gb200-fp8/8k1k/max_tpt.yaml
index 2032a01f..d78dafb2 100644
--- a/recipes/gb200-fp8/8k1k/max_tpt.yaml
+++ b/recipes/gb200-fp8/8k1k/max_tpt.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -44,7 +44,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
     MC_TE_METRIC: "true"
diff --git a/recipes/gb200-fp8/8k1k/mid-curve.yaml b/recipes/gb200-fp8/8k1k/mid-curve.yaml
index 844d464b..98674cc6 100644
--- a/recipes/gb200-fp8/8k1k/mid-curve.yaml
+++ b/recipes/gb200-fp8/8k1k/mid-curve.yaml
@@ -11,7 +11,7 @@ frontend:
 
 model:
   path: "dsr1-fp8"
-  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  container: "lmsysorg/sglang:v0.5.8-cu130"
   precision: "fp8"
 
 resources:
@@ -27,7 +27,7 @@ backend:
   # Prefill-specific environment variables
   prefill_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     MC_TE_METRIC: "true"
     SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -44,7 +44,7 @@ backend:
   # Decode-specific environment variables
   decode_environment:
     TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-    SGLANG_DG_CACHE_DIR: "/configs/dg-0.5.8_cu13"
+    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
     DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "256"
     MC_TE_METRIC: "true"