13 changes: 10 additions & 3 deletions recipes/gb200-fp8/1k1k/low-latency-mtp.yaml
@@ -1,8 +1,11 @@
name: "gb200-fp8-1p-2d-low-latency-mtp"

+frontend:
+nginx_container: nginx

model:
path: "dsfp8"
container: "0.5.8"
container: "lmsysorg/sglang:v0.5.8-cu130"

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

SGLang v0.5.8 Docker image tags lmsysorg CUDA 13

💡 Result:

I couldn’t find any published lmsysorg/sglang Docker tag for v0.5.8 that targets CUDA 13 (e.g., no evidence of a v0.5.8-cu13* / v0.5.8-cu130* tag in the public listings I could access).

What is documented/visible for CUDA 13 with lmsysorg/sglang:

  • Nightly / dev (CUDA 13): lmsysorg/sglang:dev-cu13 (docs.sglang.io)
  • Stable (CUDA 13.0 runtime line): lmsysorg/sglang:latest-cu130-runtime (docs.sglang.io)
  • Versioned CUDA 13.0 runtime tag I can confirm exists (latest I saw): lmsysorg/sglang:v0.5.7-cu130-runtime (docker.aityp.com)

If you want, tell me whether you need build image vs -runtime, and I can try to enumerate any additional *-cu13 / *-cu130-* tags that are currently published.


🌐 Web query:

Docker Hub lmsysorg/sglang v0.5.8 available tags cu130 runtime

💡 Result:

I couldn’t find any evidence that a v0.5.8-*-cu130-runtime tag exists for lmsysorg/sglang on Docker Hub (no search hits for v0.5.8-cu130-runtime, and the Docker Hub repo page doesn’t surface any v0.5.8 tags). [1]

What is referenced/available for CUDA 13.0 runtime:

  • Stable (recommended) CUDA 13 runtime tag: lmsysorg/sglang:latest-cu130-runtime (called out in the official install docs for “CUDA 13 environment”). [2]
  • Latest versioned CUDA 13 runtime tag I could find: lmsysorg/sglang:v0.5.7-cu130-runtime (listed by a Docker Hub mirror page that tracks that exact tag). [3]

Sources:
[1] Docker Hub repo page for lmsysorg/sglang (hub.docker.com)
[2] SGLang install docs mentioning latest-cu130-runtime (docs.sglang.io)
[3] Mirror page showing v0.5.7-cu130-runtime exists (docker.aityp.com)


🌐 Web query:

SGLang CUDA 13 Docker image runtime variant "-runtime" suffix

💡 Result:

In SGLang’s official Docker Hub images (lmsysorg/sglang), the -runtime suffix means the production “runtime” variant of an image tag (e.g., latest-runtime) that is smaller (~40%) because it excludes build tools and development dependencies. [1]

For CUDA 13, SGLang recommends using either: [1]

  • Nightly/dev CUDA 13 image: lmsysorg/sglang:dev-cu13
  • Stable CUDA 13 runtime image: lmsysorg/sglang:latest-cu130-runtime (note cu130 + -runtime)

Related naming in NVIDIA’s Dynamo artifacts also uses a “runtime” image plus a CUDA 13 tag (e.g., sglang-runtime:<version>-cuda13). [2]


Update the SGLang Docker image tag for CUDA 13 with the -runtime suffix.

The tag lmsysorg/sglang:v0.5.8-cu130 does not exist in published repositories. For CUDA 13, SGLang images require the -runtime suffix (e.g., lmsysorg/sglang:v0.5.8-cu130-runtime). The latest confirmed versioned CUDA 13 runtime tag is lmsysorg/sglang:v0.5.7-cu130-runtime. Verify whether v0.5.8 with CUDA 13 runtime has been released; if not, use the v0.5.7 tag or check the official SGLang documentation for the recommended CUDA 13 image.

🤖 Prompt for AI Agents
In `@recipes/gb200-fp8/1k1k/low-latency-mtp.yaml` at line 5, the Docker image tag
for SGLang is invalid: replace the container value
"lmsysorg/sglang:v0.5.8-cu130" with the CUDA13 runtime-suffixed tag (e.g.,
"lmsysorg/sglang:v0.5.8-cu130-runtime"); if v0.5.8-cu130-runtime is not
published, use the confirmed available tag
"lmsysorg/sglang:v0.5.7-cu130-runtime" instead and verify against the official
SGLang image tags; update the container field accordingly.
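One way to verify from a shell before merging (a sketch assuming the standard Docker CLI and Docker Hub's public v2 tags API are available; the tag names are taken from the queries above):

```bash
# Ask the registry for the tag's manifest; the command fails if the tag is not published.
docker manifest inspect lmsysorg/sglang:v0.5.8-cu130-runtime >/dev/null 2>&1 \
  && echo "v0.5.8-cu130-runtime is published" \
  || echo "not found; fall back to v0.5.7-cu130-runtime or latest-cu130-runtime"

# List the currently published cu130 tags (requires curl and jq).
curl -s "https://hub.docker.com/v2/repositories/lmsysorg/sglang/tags?page_size=100&name=cu130" \
  | jq -r '.results[].name'
```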

precision: "fp8"

resources:
@@ -18,7 +21,6 @@ backend:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_ENABLE_FLASHINFER_GEMM: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -32,12 +34,13 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_ENABLE_SPEC_V2: "1"
+SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
+SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
PYTHONUNBUFFERED: "1"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
SGLANG_ENABLE_JIT_DEEPGEMM: "false"
SGLANG_ENABLE_FLASHINFER_GEMM: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -53,6 +56,8 @@ backend:
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_ENABLE_SPEC_V2: "1"
+SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
+SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

sglang_config:
prefill:
@@ -81,6 +86,7 @@ backend:
tensor-parallel-size: 4
data-parallel-size: 1
expert-parallel-size: 1
+disaggregation-transfer-backend: "nixl"
speculative-algorithm: "EAGLE"
speculative-num-steps: 2
speculative-eagle-topk: 1
@@ -110,6 +116,7 @@ backend:
tensor-parallel-size: 8
data-parallel-size: 1
expert-parallel-size: 1
+disaggregation-transfer-backend: "nixl"
speculative-algorithm: "EAGLE"
speculative-num-steps: 2
speculative-eagle-topk: 1
60 changes: 13 additions & 47 deletions recipes/gb200-fp8/1k1k/max-tpt-2p1d-mtp.yaml
@@ -1,10 +1,13 @@
# GB200 FP8 Max Throughput Configuration

name: "gb200-fp8-max-tpt-mtp"
name: "gb200-fp8-max-tpt-2p1d-mtp"

+frontend:
+nginx_container: nginx

model:
path: "dsfp8"
container: "0.5.8"
container: "lmsysorg/sglang:v0.5.8-cu130"
precision: "fp8"

resources:
@@ -19,7 +22,6 @@ backend:
# Prefill-specific environment variables
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
SGLANG_DG_CACHE_DIR: "/configs/dg-01232026-{node_id}"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
MC_TE_METRIC: "true"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
@@ -36,11 +38,11 @@ backend:
SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
SGLANG_BLACKWELL_OVERLAP_SHARED_EXPERTS_OUTSIDE_SBO: "1"
FLASHINFER_WORKSPACE_BASE: "/configs/"
+SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

# Decode-specific environment variables
decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
-SGLANG_DG_CACHE_DIR: "/configs/dg-01232026-{node_id}"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
MC_TE_METRIC: "true"
@@ -60,6 +62,7 @@ backend:
SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
SGLANG_BLACKWELL_OVERLAP_SHARED_EXPERTS_OUTSIDE_SBO: "1"
FLASHINFER_WORKSPACE_BASE: "/configs/"
+SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

sglang_config:
prefill:
@@ -92,7 +95,7 @@ backend:

# Prefill-specific mode
disaggregation-mode: "prefill"
-#disaggregation-transfer-backend: "nixl"
+disaggregation-transfer-backend: "nixl"

# Memory and token limits
mem-fraction-static: 0.75
@@ -153,7 +156,7 @@ backend:

# Decode-specific mode
disaggregation-mode: "decode"
-#disaggregation-transfer-backend: "nixl"
+disaggregation-transfer-backend: "nixl"

# Memory and token limits
mem-fraction-static: 0.75
@@ -167,54 +170,17 @@ backend:
moe-dense-tp-size: 1
enable-dp-lm-head: true
prefill-round-robin-balance: true
ep-num-redundant-experts: 32
deepep-config: "/configs/deepep_config.json"

# CUDA graphs
-cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256] #, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
-cuda-graph-max-bs: 256
+cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384]
+cuda-graph-max-bs: 384

# MTP
speculative-algorithm: "EAGLE"
-speculative-num-steps: 2
+speculative-num-steps: 1
speculative-eagle-topk: 1
-speculative-num-draft-tokens: 3
+speculative-num-draft-tokens: 2

benchmark:
type: "sa-bench"
186 changes: 186 additions & 0 deletions recipes/gb200-fp8/1k1k/mid-curve-3p1d-mtp.yaml
@@ -0,0 +1,186 @@
# GB200 FP8 Max Throughput Configuration

name: "gb200-fp8-mid-curve-3p1d-mtp"
Comment on lines +1 to +3

⚠️ Potential issue | 🟡 Minor

Inconsistent header comment.

The comment on Line 1 says "Max Throughput Configuration" but the name and filename indicate this is a "mid-curve" configuration. This appears to be a copy-paste artifact.

Proposed fix
-# GB200 FP8 Max Throughput Configuration
+# GB200 FP8 Mid-Curve 3P1D MTP Configuration
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-# GB200 FP8 Max Throughput Configuration
+# GB200 FP8 Mid-Curve 3P1D MTP Configuration
name: "gb200-fp8-mid-curve-3p1d-mtp"
🤖 Prompt for AI Agents
In `@recipes/gb200-fp8/1k1k/mid-curve-3p1d-mtp.yaml` around lines 1-3, the top
comment incorrectly reads "Max Throughput Configuration"; update the header
comment to match the actual recipe type by changing it to indicate "Mid-Curve
Configuration" (or similar) so it aligns with the name
"gb200-fp8-mid-curve-3p1d-mtp" and the filename; ensure the descriptive comment
at the top reflects mid-curve rather than max-throughput for clarity.


frontend:
nginx_container: nginx

model:
path: "dsfp8"
container: "lmsysorg/sglang:v0.5.8-cu130"
precision: "fp8"

resources:
gpu_type: "gb200"
prefill_nodes: 6
prefill_workers: 3
decode_nodes: 12
decode_workers: 1
gpus_per_node: 4

backend:

# Prefill-specific environment variables
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
MC_TE_METRIC: "true"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
PYTHONUNBUFFERED: "1"
SGLANG_ENABLE_SPEC_V2: "1"
SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

# Decode-specific environment variables
decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"
MC_TE_METRIC: "true"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
PYTHONUNBUFFERED: "1"
SGLANG_ENABLE_SPEC_V2: "1"
SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"

sglang_config:
prefill:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
skip-tokenizer-init: true
trust-remote-code: true

# Parallelism
tp-size: 8
dp-size: 8
ep-size: 8
enable-dp-attention: true

# KV cache and attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# Radix cache disabled
disable-radix-cache: true

# Other flags
stream-interval: 50
max-running-requests: 30000
context-length: 2200
watchdog-timeout: 1000000
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001

# Prefill-specific mode
disaggregation-mode: "prefill"
disaggregation-transfer-backend: "nixl"

# Memory and token limits
mem-fraction-static: 0.75
max-total-tokens: 524288
chunked-prefill-size: 131072

# Request handling
load-balance-method: "round_robin"

# Performance optimizations
disable-cuda-graph: true

# DeepEP configuration
moe-a2a-backend: "deepep"
deepep-mode: "normal"
ep-dispatch-algorithm: "dynamic"
moe-dense-tp-size: 1
enable-dp-lm-head: true
ep-num-redundant-experts: 32
deepep-config: "/configs/deepep_config.json"

# MTP
speculative-algorithm: "EAGLE"
speculative-num-steps: 1
speculative-eagle-topk: 1
speculative-num-draft-tokens: 2

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
skip-tokenizer-init: true
trust-remote-code: true

# Parallelism
tp-size: 48
dp-size: 48
ep-size: 48
enable-dp-attention: true

# KV cache and attention
attention-backend: "trtllm_mla"
kv-cache-dtype: "fp8_e4m3"

# Radix cache disabled
disable-radix-cache: true

# Other flags
stream-interval: 50
decode-log-interval: 1000
max-running-requests: 45000
context-length: 2200
watchdog-timeout: 1000000
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001

# Decode-specific mode
disaggregation-mode: "decode"
disaggregation-transfer-backend: "nixl"

# Memory and token limits
mem-fraction-static: 0.75
chunked-prefill-size: 36864

Comment on lines +156 to +159

⚠️ Potential issue | 🟡 Minor

Missing max-total-tokens in decode section.

The prefill section specifies max-total-tokens: 524288 (line 96), but the decode section lacks this parameter. The comparable max-tpt-2p1d-mtp.yaml has max-total-tokens: 1703116 for decode. If this omission is intentional (relying on a default), consider adding a comment; otherwise, add the appropriate value.

🤖 Prompt for AI Agents
In `@recipes/gb200-fp8/1k1k/mid-curve-3p1d-mtp.yaml` around lines 153-156, the
decode section is missing the max-total-tokens parameter; add a line under the
decode block setting max-total-tokens: 524288 (to match the prefill section's
max-total-tokens) so the decode block explicitly defines the same total-token
limit (compare with the prefill max-total-tokens and the other recipe
max-tpt-2p1d-mtp.yaml if you intend a different value).
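For reference, a minimal sketch of the suggested addition (the 524288 value is an assumption mirroring the prefill block above; substitute the 1703116 used in max-tpt-2p1d-mtp.yaml if decode should keep its larger limit):

```yaml
decode:
  # Memory and token limits
  mem-fraction-static: 0.75
  max-total-tokens: 524288  # assumed value, mirrored from prefill; verify against decode KV-cache capacity
  chunked-prefill-size: 36864
```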

# DeepEP configuration
moe-a2a-backend: "deepep"
deepep-mode: "low_latency"
ep-dispatch-algorithm: "static"
moe-dense-tp-size: 1
enable-dp-lm-head: true
prefill-round-robin-balance: true
ep-num-redundant-experts: 32
deepep-config: "/configs/deepep_config.json"

# CUDA graphs
cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384]
cuda-graph-max-bs: 384

# MTP
speculative-algorithm: "EAGLE"
speculative-num-steps: 1
speculative-eagle-topk: 1
speculative-num-draft-tokens: 2

benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "1024x2048x4096"
req_rate: "inf"
