From c02d59d22afe402b9613723c8dd1e00fc0763f0a Mon Sep 17 00:00:00 2001
From: Grace Ho <grho@login-lyris01.lyris.clusters.nvidia.com>
Date: Thu, 29 Jan 2026 12:04:17 -0800
Subject: [PATCH 1/2] configs for gb300-fp8-no-mtp

---
 recipies/gb300-fp8/1k1k/stp/low-latency.yaml | 124 ++++++++++++++
 recipies/gb300-fp8/1k1k/stp/max.yaml         | 170 +++++++++++++++++++
 recipies/gb300-fp8/1k1k/stp/mid.yaml         | 169 ++++++++++++++++++
 recipies/gb300-fp8/8k1k/stp/low-latency.yaml | 123 ++++++++++++++
 recipies/gb300-fp8/8k1k/stp/max.yaml         | 170 +++++++++++++++++++
 recipies/gb300-fp8/8k1k/stp/mid.yaml         | 170 +++++++++++++++++++
 6 files changed, 926 insertions(+)
 create mode 100644 recipies/gb300-fp8/1k1k/stp/low-latency.yaml
 create mode 100644 recipies/gb300-fp8/1k1k/stp/max.yaml
 create mode 100644 recipies/gb300-fp8/1k1k/stp/mid.yaml
 create mode 100644 recipies/gb300-fp8/8k1k/stp/low-latency.yaml
 create mode 100644 recipies/gb300-fp8/8k1k/stp/max.yaml
 create mode 100644 recipies/gb300-fp8/8k1k/stp/mid.yaml

diff --git a/recipies/gb300-fp8/1k1k/stp/low-latency.yaml b/recipies/gb300-fp8/1k1k/stp/low-latency.yaml
new file mode 100644
index 00000000..e67bc736
--- /dev/null
+++ b/recipies/gb300-fp8/1k1k/stp/low-latency.yaml
@@ -0,0 +1,124 @@
+name: "gb300-1k1k-fp8-low-latency"  # GB300 FP8 low-latency recipe, 1k input / 1k output
+
+model:
+  path: "dsfp8"
+  container: "sglang0p5p8_cu13"
+  precision: "fp8"
+
+extra_mount:  # optional: extra directories to mount into the container
+  - "/lustre:/lustre"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1
+  decode_nodes: 4
+  prefill_workers: 1  # 1 node x 4 GPUs / 1 worker = 4 GPUs per worker (prefill tensor-parallel-size: 4)
+  decode_workers: 4  # 4 nodes x 4 GPUs / 4 workers = 4 GPUs per worker (decode tensor-parallel-size: 4)
+  gpus_per_node: 4
+
+slurm:
+  time_limit: "02:00:00"
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    # SGLANG_ENABLE_FLASHINFER_GEMM: "1"  # deprecated in 0.5.7; fp8-gemm-backend: flashinfer_trtllm is set below instead
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): transfer backend below is nixl; confirm still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    # SGLANG_ENABLE_FLASHINFER_GEMM: "1"  # deprecated in 0.5.7; fp8-gemm-backend: flashinfer_trtllm is set below instead
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      fp8-gemm-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 2200  # leaves headroom over isl + osl = 1024 + 1024 = 2048
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.95
+      max-total-tokens: 8192
+      chunked-prefill-size: 8192
+      max-prefill-tokens: 8192
+      cuda-graph-max-bs: 128
+      max-running-requests: 128
+      load-balance-method: "round_robin"
+      scheduler-recv-interval: 10
+      enable-flashinfer-allreduce-fusion: false  # to save mem
+      enable-symm-mem: false  # to save mem
+      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      fp8-gemm-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 2200  # leaves headroom over isl + osl = 1024 + 1024 = 2048
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      chunked-prefill-size: -1  # save mem
+      cuda-graph-max-bs: 128
+      max-running-requests: 128
+      scheduler-recv-interval: 1  # save mem (NOTE(review): unclear how this saves memory; confirm)
+      enable-flashinfer-allreduce-fusion: false  # to save mem
+      enable-symm-mem: false  # to save mem
+      moe-dense-tp-size: 1
+      prefill-round-robin-balance: true
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: [4,8,16,32]
+  req_rate: "inf"
+
diff --git a/recipies/gb300-fp8/1k1k/stp/max.yaml b/recipies/gb300-fp8/1k1k/stp/max.yaml
new file mode 100644
index 00000000..2d7c65dd
--- /dev/null
+++ b/recipies/gb300-fp8/1k1k/stp/max.yaml
@@ -0,0 +1,170 @@
+# GB200 FP8 Max Throughput Configuration
+
+name: "gb300-1k1k-fp8-max"
+
+model:
+  path: "dsfp8"
+  container: "sglang0p5p8_cu13"
+  precision: "fp8"
+
+extra_mount:  # optional: extra directories to mount into the container
+  - "/lustre:/lustre"
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 2
+  prefill_workers: 1  # 2 nodes x 4 GPUs / 1 worker = 8 GPUs per worker (prefill tp-size: 8)
+  decode_nodes: 2
+  decode_workers: 1  # 2 nodes x 4 GPUs / 1 worker = 8 GPUs per worker (decode tp-size: 8)
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): transfer backend below is nixl; confirm still needed
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"  # same value as decode cuda-graph-max-bs
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 2200  # leaves headroom over isl + osl = 1024 + 1024 = 2048
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # CUDA graphs are disabled for prefill workers
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+      disaggregation-transfer-backend: nixl
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 2200  # leaves headroom over isl + osl = 1024 + 1024 = 2048
+
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graph batch sizes
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 1024]
+      cuda-graph-max-bs: 1024  # largest entry in cuda-graph-bs
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: [4096,7168,7680]
+  req_rate: "inf"
+
diff --git a/recipies/gb300-fp8/1k1k/stp/mid.yaml b/recipies/gb300-fp8/1k1k/stp/mid.yaml
new file mode 100644
index 00000000..77df3500
--- /dev/null
+++ b/recipies/gb300-fp8/1k1k/stp/mid.yaml
@@ -0,0 +1,169 @@
+# GB200 FP8 Max Throughput Configuration
+name: "gb300-1k1k-fp8-mid"
+
+model:
+  path: "dsfp8"
+  container: "sglang0p5p8_cu13"
+  precision: "fp8"
+
+extra_mount:  # optional: extra directories to mount into the container
+  - "/lustre:/lustre"
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 4
+  prefill_workers: 2  # 4 nodes x 4 GPUs / 2 workers = 8 GPUs per worker (prefill tp-size: 8)
+  decode_nodes: 8
+  decode_workers: 1  # 8 nodes x 4 GPUs / 1 worker = 32 GPUs per worker (decode tp-size: 32)
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): transfer backend below is nixl; confirm still needed
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"  # same value as decode cuda-graph-max-bs
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 2200  # leaves headroom over isl + osl = 1024 + 1024 = 2048
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # CUDA graphs are disabled for prefill workers
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+      disaggregation-transfer-backend: nixl
+
+      # Parallelism
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 2200  # leaves headroom over isl + osl = 1024 + 1024 = 2048
+
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graph batch sizes
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
+      cuda-graph-max-bs: 768  # largest entry in cuda-graph-bs
+
+benchmark:
+  type: "sa-bench"
+  isl: 1024
+  osl: 1024
+  concurrencies: [1024,2048,4096,6144]
+  req_rate: "inf"
+
diff --git a/recipies/gb300-fp8/8k1k/stp/low-latency.yaml b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml
new file mode 100644
index 00000000..27300e8b
--- /dev/null
+++ b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml
@@ -0,0 +1,123 @@
+name: "gb300-8k1k-fp8-low-latency"  # GB300 FP8 low-latency recipe, 8k input / 1k output
+
+model:
+  path: "dsfp8"
+  container: "sglang0p5p8_cu13"
+  precision: "fp8"
+
+extra_mount:  # optional: extra directories to mount into the container
+  - "/lustre:/lustre"
+
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 1
+  decode_nodes: 1
+  prefill_workers: 1  # 1 node x 4 GPUs / 1 worker = 4 GPUs per worker (prefill tensor-parallel-size: 4)
+  decode_workers: 1  # 1 node x 4 GPUs / 1 worker = 4 GPUs per worker (decode tensor-parallel-size: 4)
+  gpus_per_node: 4
+
+slurm:
+  time_limit: "02:00:00"
+
+backend:
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    # SGLANG_ENABLE_FLASHINFER_GEMM: "1"  # deprecated in 0.5.7; fp8-gemm-backend: flashinfer_trtllm is set below instead
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): transfer backend below is nixl; confirm still needed
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    PYTHONUNBUFFERED: "1"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
+    # SGLANG_ENABLE_FLASHINFER_GEMM: "1"  # deprecated in 0.5.7; fp8-gemm-backend: flashinfer_trtllm is set below instead
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    MC_TE_METRIC: "true"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+
+  sglang_config:
+    prefill:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      fp8-gemm-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 9300  # sized for the 8k1k workload (8192 + 1024 = 9216 tokens)
+      disaggregation-mode: "prefill"
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.95
+      max-total-tokens: 32768
+      chunked-prefill-size: 32768
+      max-prefill-tokens: 32768
+      cuda-graph-max-bs: 128
+      max-running-requests: 128
+      load-balance-method: "round_robin"
+      scheduler-recv-interval: 10
+      enable-flashinfer-allreduce-fusion: false  # to save mem
+      enable-symm-mem: false  # to save mem
+      moe-dense-tp-size: 1
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+    decode:
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      trust-remote-code: true
+      kv-cache-dtype: "fp8_e4m3"
+      attention-backend: "trtllm_mla"
+      quantization: "fp8"
+      moe-runner-backend: "flashinfer_trtllm"
+      fp8-gemm-backend: "flashinfer_trtllm"
+      disable-radix-cache: true
+      stream-interval: 10
+      watchdog-timeout: 1000000
+      context-length: 9300  # sized for the 8k1k workload (8192 + 1024 = 9216 tokens)
+      disaggregation-mode: "decode"
+      disaggregation-transfer-backend: nixl
+      mem-fraction-static: 0.85
+      chunked-prefill-size: -1  # save mem
+      cuda-graph-max-bs: 128
+      max-running-requests: 128
+      scheduler-recv-interval: 1  # save mem (NOTE(review): unclear how this saves memory; confirm)
+      enable-flashinfer-allreduce-fusion: false  # to save mem
+      enable-symm-mem: false  # to save mem
+      moe-dense-tp-size: 1
+      prefill-round-robin-balance: true
+      tensor-parallel-size: 4
+      data-parallel-size: 1
+      expert-parallel-size: 1
+
+benchmark:
+  type: "sa-bench"
+  isl: 8102 
+  osl: 1024
+  concurrencies: [4,8]
+  req_rate: "inf"
diff --git a/recipies/gb300-fp8/8k1k/stp/max.yaml b/recipies/gb300-fp8/8k1k/stp/max.yaml
new file mode 100644
index 00000000..ee0ae2f5
--- /dev/null
+++ b/recipies/gb300-fp8/8k1k/stp/max.yaml
@@ -0,0 +1,170 @@
+# GB200 FP8 Max Throughput Configuration
+
+name: "gb300-8k1k-fp8-max"
+
+model:
+  path: "dsfp8"
+  container: "sglang0p5p8_cu13"
+  precision: "fp8"
+
+extra_mount:  # optional: extra directories to mount into the container
+  - "/lustre:/lustre"
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 12
+  prefill_workers: 6  # 12 nodes x 4 GPUs / 6 workers = 8 GPUs per worker (prefill tp-size: 8)
+  decode_nodes: 6
+  decode_workers: 1  # 6 nodes x 4 GPUs / 1 worker = 24 GPUs per worker (decode tp-size: 24)
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): transfer backend below is nixl; confirm still needed
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"  # same value as decode cuda-graph-max-bs
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 9300  # leaves headroom over isl + osl = 8192 + 1024 = 9216
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # CUDA graphs are disabled for prefill workers
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+      disaggregation-transfer-backend: nixl
+
+      # Parallelism
+      tp-size: 24
+      dp-size: 24
+      ep-size: 24
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 9300  # leaves headroom over isl + osl = 8192 + 1024 = 9216
+
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graph batch sizes
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
+      cuda-graph-max-bs: 768  # largest entry in cuda-graph-bs
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: [2048,4096,10240]
+  req_rate: "inf"
+
diff --git a/recipies/gb300-fp8/8k1k/stp/mid.yaml b/recipies/gb300-fp8/8k1k/stp/mid.yaml
new file mode 100644
index 00000000..9842c8aa
--- /dev/null
+++ b/recipies/gb300-fp8/8k1k/stp/mid.yaml
@@ -0,0 +1,170 @@
+# GB200 FP8 Max Throughput Configuration
+
+name: "gb300-8k1k-fp8-mid"
+
+model:
+  path: "dsfp8"
+  container: "sglang0p5p8_cu13"
+  precision: "fp8"
+
+extra_mount:  # optional: extra directories to mount into the container
+  - "/lustre:/lustre"
+resources:
+  gpu_type: "gb300"
+  prefill_nodes: 10
+  prefill_workers: 5  # 10 nodes x 4 GPUs / 5 workers = 8 GPUs per worker (prefill tp-size: 8)
+  decode_nodes: 8
+  decode_workers: 1  # 8 nodes x 4 GPUs / 1 worker = 32 GPUs per worker (decode tp-size: 32)
+  gpus_per_node: 4
+
+backend:
+
+  # Prefill-specific environment variables
+  prefill_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"  # NOTE(review): transfer backend below is nixl; confirm still needed
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  # Decode-specific environment variables
+  decode_environment:
+    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
+    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"
+    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
+    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "768"  # same value as decode cuda-graph-max-bs
+    MC_TE_METRIC: "true"
+    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
+    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
+    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
+    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
+    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
+    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
+    MC_FORCE_MNNVL: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_CUMEM_ENABLE: "1"
+    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
+    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
+    PYTHONUNBUFFERED: "1"
+
+  sglang_config:
+    prefill:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+
+      # Parallelism
+      tp-size: 8
+      dp-size: 8
+      ep-size: 8
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      max-running-requests: 30000
+      context-length: 9300  # leaves headroom over isl + osl = 8192 + 1024 = 9216
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+      disaggregation-transfer-backend: nixl
+
+      # Prefill-specific mode
+      disaggregation-mode: "prefill"
+
+      # Memory and token limits
+      mem-fraction-static: 0.75
+      max-total-tokens: 524288
+      chunked-prefill-size: 131072
+
+      # Request handling
+      load-balance-method: "round_robin"
+
+      # CUDA graphs are disabled for prefill workers
+      disable-cuda-graph: true
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "normal"
+      ep-dispatch-algorithm: "dynamic"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+    decode:
+      # Model configuration
+      served-model-name: "deepseek-ai/DeepSeek-R1"
+      skip-tokenizer-init: true
+      trust-remote-code: true
+      disaggregation-transfer-backend: nixl
+
+      # Parallelism
+      tp-size: 32
+      dp-size: 32
+      ep-size: 32
+      enable-dp-attention: true
+
+      # KV cache and attention
+      attention-backend: "trtllm_mla"
+      kv-cache-dtype: "fp8_e4m3"
+
+      # Radix cache disabled
+      disable-radix-cache: true
+
+      # Other flags
+      stream-interval: 50
+      decode-log-interval: 1000
+      max-running-requests: 45000
+      context-length: 9300  # leaves headroom over isl + osl = 8192 + 1024 = 9216
+
+      watchdog-timeout: 1000000
+      disable-shared-experts-fusion: true
+      eplb-algorithm: "deepseek"
+      disaggregation-bootstrap-port: 30001
+
+      # Decode-specific mode
+      disaggregation-mode: "decode"
+
+      # Memory and token limits
+      mem-fraction-static: 0.82
+      chunked-prefill-size: 36864
+
+      # DeepEP configuration
+      moe-a2a-backend: "deepep"
+      deepep-mode: "low_latency"
+      ep-dispatch-algorithm: "static"
+      moe-dense-tp-size: 1
+      enable-dp-lm-head: true
+      prefill-round-robin-balance: true
+      ep-num-redundant-experts: 32
+      deepep-config: "/configs/deepep_config.json"
+
+      # CUDA graph batch sizes
+      cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768]
+      cuda-graph-max-bs: 768  # largest entry in cuda-graph-bs
+
+benchmark:
+  type: "sa-bench"
+  isl: 8192
+  osl: 1024
+  concurrencies: [128,256,512,1024,2048,4096]
+  req_rate: "inf"
+

From bb7c6171a6b1d5fdea00aed20b209aa1b3c33cce Mon Sep 17 00:00:00 2001
From: Grace Ho <grho@login-lyris01.lyris.clusters.nvidia.com>
Date: Thu, 29 Jan 2026 13:08:18 -0800
Subject: [PATCH 2/2] typos from coderabbit addressed

---
 recipies/gb300-fp8/1k1k/stp/max.yaml         | 2 +-
 recipies/gb300-fp8/1k1k/stp/mid.yaml         | 2 +-
 recipies/gb300-fp8/8k1k/stp/low-latency.yaml | 2 +-
 recipies/gb300-fp8/8k1k/stp/max.yaml         | 2 +-
 recipies/gb300-fp8/8k1k/stp/mid.yaml         | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/recipies/gb300-fp8/1k1k/stp/max.yaml b/recipies/gb300-fp8/1k1k/stp/max.yaml
index 2d7c65dd..664a9875 100644
--- a/recipies/gb300-fp8/1k1k/stp/max.yaml
+++ b/recipies/gb300-fp8/1k1k/stp/max.yaml
@@ -1,4 +1,4 @@
-# GB200 FP8 Max Throughput Configuration
+# GB300 FP8 Max Throughput Configuration
 
 name: "gb300-1k1k-fp8-max"
 
diff --git a/recipies/gb300-fp8/1k1k/stp/mid.yaml b/recipies/gb300-fp8/1k1k/stp/mid.yaml
index 77df3500..5131c692 100644
--- a/recipies/gb300-fp8/1k1k/stp/mid.yaml
+++ b/recipies/gb300-fp8/1k1k/stp/mid.yaml
@@ -1,4 +1,4 @@
-# GB200 FP8 Max Throughput Configuration
+# GB300 FP8 Mid Throughput Configuration
 name: "gb300-1k1k-fp8-mid"
 
 model:
diff --git a/recipies/gb300-fp8/8k1k/stp/low-latency.yaml b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml
index 27300e8b..07a7d781 100644
--- a/recipies/gb300-fp8/8k1k/stp/low-latency.yaml
+++ b/recipies/gb300-fp8/8k1k/stp/low-latency.yaml
@@ -117,7 +117,7 @@ backend:
 
 benchmark:
   type: "sa-bench"
-  isl: 8102 
+  isl: 8192 
   osl: 1024
   concurrencies: [4,8]
   req_rate: "inf"
diff --git a/recipies/gb300-fp8/8k1k/stp/max.yaml b/recipies/gb300-fp8/8k1k/stp/max.yaml
index ee0ae2f5..010bed5e 100644
--- a/recipies/gb300-fp8/8k1k/stp/max.yaml
+++ b/recipies/gb300-fp8/8k1k/stp/max.yaml
@@ -1,4 +1,4 @@
-# GB200 FP8 Max Throughput Configuration
+# GB300 FP8 Max Throughput Configuration
 
 name: "gb300-8k1k-fp8-max"
 
diff --git a/recipies/gb300-fp8/8k1k/stp/mid.yaml b/recipies/gb300-fp8/8k1k/stp/mid.yaml
index 9842c8aa..f7f16b54 100644
--- a/recipies/gb300-fp8/8k1k/stp/mid.yaml
+++ b/recipies/gb300-fp8/8k1k/stp/mid.yaml
@@ -1,4 +1,4 @@
-# GB200 FP8 Max Throughput Configuration
+# GB300 FP8 Mid Throughput Configuration
 
 name: "gb300-8k1k-fp8-mid"