ishandhanani · kyleliang-nv · Jan 24, 2026 · Jan 26, 2026 · Jan 26, 2026 · Jan 26, 2026
diff --git a/.gitignore b/.gitignore
@@ -51,4 +51,8 @@ configs/*.tar.gz
 .ruff_cache/
 *.egg-info/
 
-.coverage
+.coverage
+
+configs/dg-*
+configs/flashinfer-cache/
+outputs/*
diff --git a/recipies/gb200-fp4/1k8k/low-latency.yaml b/recipies/gb200-fp4/1k8k/low-latency.yaml
@@ -1,8 +1,16 @@
 name: "gb200-fp4-1p2d"
 
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 4
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -24,8 +32,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    #SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    #SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
@@ -43,8 +49,6 @@ backend:
     SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
     SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
     SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
-    # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
-    # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
     MC_FORCE_MNNVL: "1"
     NCCL_MNNVL_ENABLE: "1"
     NCCL_CUMEM_ENABLE: "1"
@@ -64,7 +68,7 @@ backend:
       moe-runner-backend: "flashinfer_trtllm"
       stream-interval: 10
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       mem-fraction-static: 0.95
       max-total-tokens: 8192
       chunked-prefill-size: 8192
@@ -77,7 +81,6 @@ backend:
       data-parallel-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
-      disaggregation-transfer-backend: nixl
 
     decode:
       disaggregation-mode: "decode"
@@ -92,15 +95,14 @@ backend:
       disaggregation-bootstrap-port: 30001
       stream-interval: 10
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       mem-fraction-static: 0.95
       chunked-prefill-size: 8192
       cuda-graph-max-bs: 256
       scheduler-recv-interval: 10
       moe-dense-tp-size: 1
       tensor-parallel-size: 4
       expert-parallel-size: 1
-      disaggregation-transfer-backend: nixl
 
 benchmark:
   type: "sa-bench"

diff --git a/recipies/gb200-fp4/1k8k/max-tpt.yaml b/recipies/gb200-fp4/1k8k/max-tpt.yaml
@@ -1,10 +1,16 @@
-# 4P1D, with 12 Decode Nodes. Uses single batch overlap
-
 name: "gb200-fp4-max-tpt"
 
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
+
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -56,13 +62,13 @@ backend:
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
     SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
-      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -80,7 +86,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -112,7 +118,6 @@ backend:
       # Model configuration
       served-model-name: "deepseek-ai/DeepSeek-R1"
       trust-remote-code: true
-      disaggregation-transfer-backend: nixl
 
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
@@ -130,7 +135,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -228,7 +233,6 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
-      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 48

diff --git a/recipies/gb200-fp4/1k8k/mid-curve.yaml b/recipies/gb200-fp4/1k8k/mid-curve.yaml
@@ -1,11 +1,16 @@
-# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
-# per gpu throughput
+name: "gb200-fp4-mid-curve"
 
-name: "gb200-fp4-max-tpt-2"
+dynamo:
+  version: 0.7.0
+
+frontend:
+  type: dynamo
+  enable_multiple_frontends: true
+  num_additional_frontends: 9
 
 model:
   path: "dsr1"
-  container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
+  container: "lmsysorg/sglang:v0.5.5.post2"
   precision: "fp4"
 
 resources:
@@ -57,6 +62,7 @@ backend:
     SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
     SGLANG_MOE_NVFP4_DISPATCH: "1"
     SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
+    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
 
   sglang_config:
     prefill:
@@ -67,7 +73,6 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
-      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -81,7 +86,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -117,7 +122,6 @@ backend:
       # KV cache and attention
       kv-cache-dtype: "fp8_e4m3"
       attention-backend: "trtllm_mla"
-      disaggregation-transfer-backend: nixl
 
       # Quantization
       quantization: "modelopt_fp4"
@@ -131,7 +135,7 @@ backend:
       stream-interval: 50
       decode-log-interval: 1000
       watchdog-timeout: 1000000
-      context-length: 9200
+      context-length: 10000
       disable-shared-experts-fusion: true
       eplb-algorithm: "deepseek"
       disaggregation-bootstrap-port: 30001
@@ -228,7 +232,6 @@ backend:
       enable-dp-lm-head: true
       prefill-round-robin-balance: true
       enable-dp-attention: true
-      fp4-gemm-backend: "flashinfer_cutlass"
 
       # Parallelism
       tp-size: 32

diff --git a/recipies/h200/1k1k/bs128-agg-tp.yaml b/recipies/h200/1k1k/bs128-agg-tp.yaml
@@ -0,0 +1,59 @@
+name: "agg-tp-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  agg_nodes: 1
+  agg_workers: 1
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    aggregated:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 1
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 10
+      max-running-requests: 512  # sum of all dp
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-prefill-tokens: 32768
+      chunked-prefill-size: 32768
+
+      # CUDA graphs
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "1x4x16x32x64x128x256x512"
+  req_rate: "inf"
diff --git a/recipies/h200/1k1k/bs256-1p6d-dep.yaml b/recipies/h200/1k1k/bs256-1p6d-dep.yaml
@@ -0,0 +1,100 @@
+name: "bs256-1p6d-h200-fp8"
+
+model:
+  path: "dsfp8"
+  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
+  precision: "fp8"
+
+resources:
+  gpu_type: "h200"
+  prefill_nodes: 1
+  prefill_workers: 1
+  decode_nodes: 6
+  decode_workers: 6
+  gpus_per_node: 8
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  # Decode-specific environment variables
+  decode_environment:
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      # stream-interval: 50
+      max-running-requests: 512
+
+
+      # Disaggregation (prefill side)
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-prefill-tokens: 65536
+      chunked-prefill-size: 262144
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      model-path: "/model/"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "flashinfer"
+
+      # Other flags
+      disable-radix-cache: true
+      stream-interval: 10
+
+      # Disaggregation (decode side)
+      disaggregation-bootstrap-port: 30001
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      max-running-requests: 512
+      cuda-graph-max-bs: 512
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: "128x256x512x1024x2048"
+  req_rate: "inf"
+