Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions recipies/gb200-fp4/1k1k/max-tpt-mtp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ backend:
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1"
SGLANG_ENABLE_SPEC_V2: "1"
SGLANG_NCCL_ALL_GATHER_IN_OVERLAP_SCHEDULER_SYNC_BATCH: "1"
Expand Down Expand Up @@ -65,6 +66,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"
SGLANG_NVFP4_CKPT_FP8_NEXTN_MOE: "1"
SGLANG_ENABLE_SPEC_V2: "1"
Expand Down
1 change: 1 addition & 0 deletions recipies/gb200-fp4/1k1k/max-tpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
Expand Down
3 changes: 2 additions & 1 deletion recipies/gb200-fp4/1k1k/mid-curve.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
# per gpu throughput

name: "gb200-fp4-max-tpt-2"
Expand Down Expand Up @@ -57,6 +57,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
Expand Down
1 change: 1 addition & 0 deletions recipies/gb200-fp4/1k8k/max-tpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions

sglang_config:
prefill:
Expand Down
1 change: 1 addition & 0 deletions recipies/gb200-fp4/1k8k/mid-curve.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions

sglang_config:
prefill:
Expand Down
35 changes: 18 additions & 17 deletions recipies/gb200-fp4/8k1k/max-tpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ model:

resources:
gpu_type: "gb200"
prefill_nodes: 10
decode_nodes: 8
prefill_workers: 10
prefill_nodes: 10
decode_nodes: 8
prefill_workers: 10
decode_workers: 1
gpus_per_node: 4

Expand Down Expand Up @@ -54,6 +54,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
Expand All @@ -79,7 +80,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9600
context-length: 9600
disable-shared-experts-fusion: true
disaggregation-bootstrap-port: 30001

Expand All @@ -89,22 +90,22 @@ backend:
# Memory and token limits
mem-fraction-static: 0.95
max-total-tokens: 131072
max-prefill-tokens: 524288
chunked-prefill-size: 131072
max-prefill-tokens: 524288
chunked-prefill-size: 131072

# Request handling
max-running-requests: 30000
load-balance-method: "round_robin"

# Performance optimizations
disable-cuda-graph: true
enable-dp-attention: false
enable-dp-attention: false

# Parallelism
tp-size: 4
dp-size: 1
ep-size: 1

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
Expand All @@ -127,7 +128,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9600
context-length: 9600
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001
Expand All @@ -137,11 +138,11 @@ backend:

# Memory and token limits
mem-fraction-static: 0.83
max-total-tokens: 524288
chunked-prefill-size: 24576
max-total-tokens: 524288
chunked-prefill-size: 24576

# Request handling
max-running-requests: 16384
max-running-requests: 16384

# DeepEP configuration
moe-a2a-backend: "deepep"
Expand All @@ -159,13 +160,13 @@ backend:
enable-dp-attention: true

# Parallelism
tp-size: 32
dp-size: 32
ep-size: 32
tp-size: 32
dp-size: 32
ep-size: 32

benchmark:
type: "sa-bench"
isl: 8192
isl: 8192
osl: 1024
concurrencies: "1024x2048x8192"
req_rate: 700
req_rate: 700
31 changes: 16 additions & 15 deletions recipies/gb200-fp4/8k1k/mid-curve.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ model:
resources:
gpu_type: "gb200"
prefill_nodes: 6
decode_nodes: 12
decode_nodes: 12
prefill_workers: 6
decode_workers: 1
gpus_per_node: 4
Expand Down Expand Up @@ -54,6 +54,7 @@ backend:
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "512"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
Expand All @@ -79,7 +80,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9600
context-length: 9600
disable-shared-experts-fusion: true
disaggregation-bootstrap-port: 30001

Expand All @@ -89,22 +90,22 @@ backend:
# Memory and token limits
mem-fraction-static: 0.95
max-total-tokens: 131072
max-prefill-tokens: 524288
chunked-prefill-size: 131072
max-prefill-tokens: 524288
chunked-prefill-size: 131072

# Request handling
max-running-requests: 30000
load-balance-method: "round_robin"

# Performance optimizations
disable-cuda-graph: true
enable-dp-attention: false
enable-dp-attention: false

# Parallelism
tp-size: 4
dp-size: 1
ep-size: 1

decode:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
Expand All @@ -127,7 +128,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9600
context-length: 9600
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001
Expand All @@ -137,11 +138,11 @@ backend:

# Memory and token limits
mem-fraction-static: 0.83
max-total-tokens: 524288
chunked-prefill-size: 24576
max-total-tokens: 524288
chunked-prefill-size: 24576

# Request handling
max-running-requests: 16384
max-running-requests: 16384

# DeepEP configuration
moe-a2a-backend: "deepep"
Expand All @@ -159,13 +160,13 @@ backend:
enable-dp-attention: true

# Parallelism
tp-size: 48
dp-size: 48
ep-size: 48
tp-size: 48
dp-size: 48
ep-size: 48

benchmark:
type: "sa-bench"
isl: 8192
isl: 8192
osl: 1024
concurrencies: "512x1024x2048x4096"
req_rate: 700
req_rate: 700