Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,8 @@ configs/*.tar.gz
.ruff_cache/
*.egg-info/

.coverage
.coverage

configs/dg-*
configs/flashinfer-cache/
outputs/*
20 changes: 11 additions & 9 deletions recipies/gb200-fp4/1k8k/low-latency.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
name: "gb200-fp4-1p2d"

dynamo:
version: 0.7.0

frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 4

model:
path: "dsr1"
container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
container: "lmsysorg/sglang:v0.5.5.post2"
precision: "fp4"

resources:
Expand All @@ -24,8 +32,6 @@ backend:
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
#SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
#SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
Expand All @@ -43,8 +49,6 @@ backend:
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
# SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
# SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
MC_FORCE_MNNVL: "1"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
Expand All @@ -64,7 +68,7 @@ backend:
moe-runner-backend: "flashinfer_trtllm"
stream-interval: 10
watchdog-timeout: 1000000
context-length: 9200
context-length: 10000
mem-fraction-static: 0.95
max-total-tokens: 8192
chunked-prefill-size: 8192
Expand All @@ -77,7 +81,6 @@ backend:
data-parallel-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
disaggregation-transfer-backend: nixl

decode:
disaggregation-mode: "decode"
Expand All @@ -92,15 +95,14 @@ backend:
disaggregation-bootstrap-port: 30001
stream-interval: 10
watchdog-timeout: 1000000
context-length: 9200
context-length: 10000
mem-fraction-static: 0.95
chunked-prefill-size: 8192
cuda-graph-max-bs: 256
scheduler-recv-interval: 10
moe-dense-tp-size: 1
tensor-parallel-size: 4
expert-parallel-size: 1
disaggregation-transfer-backend: nixl

benchmark:
type: "sa-bench"
Expand Down
20 changes: 12 additions & 8 deletions recipies/gb200-fp4/1k8k/max-tpt.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# 4P1D, with 12 Decode Nodes. Uses single batch overlap

name: "gb200-fp4-max-tpt"

dynamo:
version: 0.7.0

frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 9

model:
path: "dsr1"
container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
container: "lmsysorg/sglang:v0.5.5.post2"
precision: "fp4"

resources:
Expand Down Expand Up @@ -56,13 +62,13 @@ backend:
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
prefill:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
disaggregation-transfer-backend: nixl

# KV cache and attention
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -80,7 +86,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9200
context-length: 10000
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001
Expand Down Expand Up @@ -112,7 +118,6 @@ backend:
# Model configuration
served-model-name: "deepseek-ai/DeepSeek-R1"
trust-remote-code: true
disaggregation-transfer-backend: nixl

# KV cache and attention
kv-cache-dtype: "fp8_e4m3"
Expand All @@ -130,7 +135,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9200
context-length: 10000
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001
Expand Down Expand Up @@ -228,7 +233,6 @@ backend:
enable-dp-lm-head: true
prefill-round-robin-balance: true
enable-dp-attention: true
fp4-gemm-backend: "flashinfer_cutlass"

# Parallelism
tp-size: 48
Expand Down
21 changes: 12 additions & 9 deletions recipies/gb200-fp4/1k8k/mid-curve.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# 4P1D, with 8 Decode Nodes. Does not use single batch overlap but allows us to currently drive higher
# per gpu throughput
name: "gb200-fp4-mid-curve"

name: "gb200-fp4-max-tpt-2"
dynamo:
version: 0.7.0

frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 9

model:
path: "dsr1"
container: "lmsysorg/sglang:nightly-dev-cu13-20260121-1e309030"
container: "lmsysorg/sglang:v0.5.5.post2"
precision: "fp4"

resources:
Expand Down Expand Up @@ -57,6 +62,7 @@ backend:
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
SGLANG_MOE_NVFP4_DISPATCH: "1"
SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1" # Used in older sglang versions
SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
prefill:
Expand All @@ -67,7 +73,6 @@ backend:
# KV cache and attention
kv-cache-dtype: "fp8_e4m3"
attention-backend: "trtllm_mla"
disaggregation-transfer-backend: nixl

# Quantization
quantization: "modelopt_fp4"
Expand All @@ -81,7 +86,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9200
context-length: 10000
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001
Expand Down Expand Up @@ -117,7 +122,6 @@ backend:
# KV cache and attention
kv-cache-dtype: "fp8_e4m3"
attention-backend: "trtllm_mla"
disaggregation-transfer-backend: nixl

# Quantization
quantization: "modelopt_fp4"
Expand All @@ -131,7 +135,7 @@ backend:
stream-interval: 50
decode-log-interval: 1000
watchdog-timeout: 1000000
context-length: 9200
context-length: 10000
disable-shared-experts-fusion: true
eplb-algorithm: "deepseek"
disaggregation-bootstrap-port: 30001
Expand Down Expand Up @@ -228,7 +232,6 @@ backend:
enable-dp-lm-head: true
prefill-round-robin-balance: true
enable-dp-attention: true
fp4-gemm-backend: "flashinfer_cutlass"

# Parallelism
tp-size: 32
Expand Down
59 changes: 59 additions & 0 deletions recipies/h200/1k1k/bs128-agg-tp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Aggregated SGLang recipe: DeepSeek-R1 (FP8) served by a single worker on one
# 8-GPU H200 node with tp-size 8, benchmarked with sa-bench at ISL/OSL 1024/1024.
name: "agg-tp-h200-fp8"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
  precision: "fp8"

resources:
  gpu_type: "h200"
  agg_nodes: 1
  agg_workers: 1
  gpus_per_node: 8

backend:
  # NOTE(review): this recipe only declares aggregated workers; confirm the
  # launcher actually applies prefill_/decode_environment in aggregated mode.

  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"

  sglang_config:
    aggregated:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism
      tp-size: 8
      dp-size: 1

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      stream-interval: 10
      max-running-requests: 512  # sum of all dp

      # Memory and token limits
      mem-fraction-static: 0.82
      max-prefill-tokens: 32768
      chunked-prefill-size: 32768

      # CUDA graphs
      cuda-graph-max-bs: 512

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x4x16x32x64x128x256x512"
  req_rate: "inf"
100 changes: 100 additions & 0 deletions recipies/h200/1k1k/bs256-1p6d-dep.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Disaggregated SGLang recipe: DeepSeek-R1 (FP8) on H200 with 1 prefill worker
# and 6 decode workers (one 8-GPU node each), KV transfer over nixl,
# benchmarked with sa-bench at ISL/OSL 1024/1024.
name: "bs256-1p6d-h200-fp8"

model:
  path: "dsfp8"
  container: "lmsysorg/sglang:v0.5.8-cu130-runtime"
  precision: "fp8"

resources:
  gpu_type: "h200"
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 6
  decode_workers: 6
  gpus_per_node: 8

backend:
  # Prefill-specific environment variables
  prefill_environment:
    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"

  # Decode-specific environment variables
  decode_environment:
    SGLANG_DG_CACHE_DIR: "/configs/dg-10212025"

  sglang_config:
    prefill:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism
      tp-size: 8
      dp-size: 8
      ep-size: 8
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Radix cache disabled
      disable-radix-cache: true

      # Other flags
      # stream-interval: 50
      max-running-requests: 512

      # Prefill-specific mode
      # Bootstrap port matches the decode section below.
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      mem-fraction-static: 0.75
      max-prefill-tokens: 65536
      # NOTE(review): larger than max-prefill-tokens above; confirm this is
      # intended (e.g. a value that is split per DP rank).
      chunked-prefill-size: 262144

      # Request handling
      load-balance-method: "round_robin"

    decode:
      # Model configuration
      served-model-name: "deepseek-ai/DeepSeek-R1"
      model-path: "/model/"
      skip-tokenizer-init: true
      trust-remote-code: true

      # Parallelism
      tp-size: 8
      dp-size: 8
      ep-size: 8
      enable-dp-attention: true

      # KV cache and attention
      attention-backend: "flashinfer"

      # Other flags
      disable-radix-cache: true
      stream-interval: 10

      # Disagg
      # Bootstrap port matches the prefill section above.
      disaggregation-bootstrap-port: 30001
      disaggregation-mode: "decode"
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      mem-fraction-static: 0.82
      max-running-requests: 512
      cuda-graph-max-bs: 512

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "128x256x512x1024x2048"
  req_rate: "inf"

Loading
Loading