Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
73 commits
Select commit Hold shift + click to select a range
93db2e2
Day 0 DeepSeek V4 Pro FP4 GB200 disaggregated SGLang benchmarks
Oseltamivir Apr 25, 2026
1bc4c2e
Drop unsupported backend.connector field from sglang recipes
Oseltamivir Apr 25, 2026
c0d477d
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 25, 2026
65b8b17
Drop dynamo: version: 0.8.1 — incompatible with deepseek-v4-grace-bla…
Oseltamivir Apr 25, 2026
9d883ba
Add dynamo: install: false — srtctl default is install=True
Oseltamivir Apr 25, 2026
1b75dd7
Pin dynamo to v1.2.0-sglang-deepseek-v4-dev.1 tag (hash 21f135f5)
Oseltamivir Apr 25, 2026
eb3f62c
Force deepep-mode: low_latency to work around mxfp4+DeepEP normal-dis…
Oseltamivir Apr 25, 2026
6c608df
Drop DeepEP / DP-attn / EP — fork-only mxfp4_deepseek bug, both dispa…
Oseltamivir Apr 25, 2026
2bb3ef0
Add moe-dense-tp-size: 1 — fix shared-experts FP8 block-quant divisib…
Oseltamivir Apr 25, 2026
d34d894
Set SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=1024 in all env bl…
Oseltamivir Apr 25, 2026
c24f25b
Switch to TP=4 single-node — match PR #75 verbatim, fix FP8 block-quant
Oseltamivir Apr 25, 2026
c0aec93
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 25, 2026
8316d3f
Restore mi355x retry changelog entries clobbered by merge
Oseltamivir Apr 25, 2026
f089567
Switch back to TP=8: enable-dp-attention + moe-dense-tp-size: 1, no m…
Oseltamivir Apr 26, 2026
34e4a92
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 26, 2026
5b6eb2f
Scope sweep to high-conc DeepEP only — temporarily comment 1p1d blocks
Oseltamivir Apr 26, 2026
b913586
tep fix + dep for high conc
Oseltamivir Apr 26, 2026
bca99eb
sike no dpa
Oseltamivir Apr 26, 2026
6c09973
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 26, 2026
5866658
Cap SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK at 1024 — sglang L…
Oseltamivir Apr 26, 2026
c0fc3bb
Revert 3p1d-dep8-dep16 to no-DeepEP TP-only; uncomment full 1k/1k + 8…
Oseltamivir Apr 26, 2026
0526fa0
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 26, 2026
30c2512
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
bc9fccf
Try moe-a2a-backend: flashinfer on 3p1d-dep8-dep16 for high-conc EP
Oseltamivir Apr 27, 2026
8ea8e77
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
e6d8943
Revert flashinfer EP attempt — accept TP-only pareto, every EP backen…
Oseltamivir Apr 27, 2026
90304df
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
1d27533
fix(sglang): bump 8k1k prefill max-running-requests from 4 to 8
Oseltamivir Apr 27, 2026
a172069
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 27, 2026
df1c783
ports
Oseltamivir Apr 28, 2026
513cbef
Dsv4 fp4 gb200 dynamo sglang disagg (#1213)
ch-wan Apr 28, 2026
fa876e3
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
Oseltamivir Apr 28, 2026
b27c8da
adapt for model path, etc
Oseltamivir Apr 28, 2026
0dbc9a4
dev
ch-wan Apr 28, 2026
ba72558
upd
ch-wan Apr 28, 2026
7c81fe9
fix
ch-wan Apr 28, 2026
7a1daaf
fix
ch-wan Apr 28, 2026
8ce4965
Merge branch 'main' into dsv4-fp4-gb200-dynamo-sglang-disagg
ch-wan Apr 28, 2026
c454ad3
test
ch-wan Apr 28, 2026
bac301d
add gb300
ch-wan Apr 28, 2026
1167f64
upd
ch-wan Apr 28, 2026
cfae9ae
fix
ch-wan Apr 28, 2026
8aa71cd
Merge commit '06596136c1e0115106ed051af12ca630796b228e' into dsv4-fp4…
ch-wan Apr 28, 2026
0443a1f
fix
ch-wan Apr 28, 2026
387726d
fix
ch-wan Apr 29, 2026
fe6815c
fix(launch_gb300-cw): register deepseek-v4-pro alias in model_paths
ch-wan Apr 29, 2026
b4d6c19
fix(launch_gb300-cw): pull arm64 squash and force fresh import per ru…
ch-wan Apr 29, 2026
cad94c9
fix(launch_gb300-cw): use enroot --arch aarch64, not arm64
ch-wan Apr 29, 2026
d6fc0e7
fix(launch_gb300-cw): use pre-staged arm64 sqsh, drop in-CI enroot im…
ch-wan Apr 29, 2026
da6f892
fix(launch_gb300-cw): persist dynamo wheel cache and ulimit preamble
ch-wan Apr 29, 2026
28d03e8
fix(sglang/dsv4/8k1k recipes): set cpus-per-task=144 for dynamo build
ch-wan Apr 29, 2026
16113f8
fix(sglang/dsv4/8k1k recipes): set cpus-per-task=144 and mem=0
ch-wan Apr 29, 2026
ade5488
fix(launch_gb300-cw): pin srt-slurm fork with parallel sa-bench
ch-wan Apr 29, 2026
b19eb9a
merge: origin/main into dsv4-fp4-gb200-dynamo-sglang-disagg
ch-wan Apr 29, 2026
152a059
fix(launch_gb300-cw): bump srt-slurm fork pin to minimal multiproc patch
fzyzcjy Apr 29, 2026
c435a65
ci: temporarily comment out conc-list:[64] 2p1d entry
fzyzcjy Apr 29, 2026
be12dba
ci(eval): temporarily skip dsv4-fp4-gb300 dynamo-sglang eval-only entry
fzyzcjy Apr 29, 2026
38acd77
bench(7p1d-dep4-dep8): swap sa-bench default for yangminl's gb300-cw …
fzyzcjy Apr 29, 2026
22c5e67
config(7p1d-dep4-dep8): align with job 564 — multi-frontend, sbatch d…
fzyzcjy Apr 29, 2026
15423f1
config(7p1d-dep4-dep8): keep PR name field, revert to original
fzyzcjy Apr 29, 2026
cba5297
Merge remote-tracking branch 'origin/main' into dsv4-fp4-gb200-dynamo…
fzyzcjy Apr 29, 2026
a1a6f8d
upd
ch-wan Apr 29, 2026
b146b86
fix
ch-wan Apr 29, 2026
f521e2e
Merge commit '3cfb0b9620ad1f11f9d9412409fb2f67a757c3d7' into dsv4-fp4…
ch-wan Apr 29, 2026
c843c0d
fix
ch-wan Apr 29, 2026
927edfe
middle
ch-wan Apr 29, 2026
c14d06d
fi
ch-wan Apr 29, 2026
7d977cf
Merge commit '182c80aaecb80fc79a074cc38876235a32013bcd' into dsv4-fp4…
ch-wan Apr 29, 2026
5e86ffc
fix
ch-wan Apr 29, 2026
5776fd5
upd
ch-wan Apr 30, 2026
b472c78
Merge commit '49651ae6b535c4df02e132d2a9877eb2a5c6ca30' into dsv4-fp4…
ch-wan Apr 30, 2026
fce13d0
fix
ch-wan Apr 30, 2026
484763a
upd
ch-wan Apr 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7666,3 +7666,115 @@ dsv4-fp4-gb200-dynamo-vllm:
tp: 16
ep: 16
dp-attn: true

# DeepSeek-V4-Pro, FP4, on the gb200 runner with the dynamo-sglang
# framework: multi-node, disaggregated prefill/decode. Each search-space
# entry pairs a concurrency list with one prefill/decode topology and
# the recipe file (CONFIG_FILE) that implements it.
dsv4-fp4-gb200-dynamo-sglang:
image: lmsysorg/sglang:deepseek-v4-grace-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: gb200
precision: fp4
framework: dynamo-sglang
multinode: true
disagg: true
seq-len-configs:
# 1k/1k — hand-rolled. NVIDIA/srt-slurm has no DSV4 sglang disagg
# recipe yet; topologies match the dsv4-fp4-gb200-dynamo-vllm sibling
# so framework-level numbers are directly comparable. Per-worker
# tunings cross-reference benchmarks/single_node/dsv4_fp4_b200.sh and
# NVIDIA/srt-slurm@sa-submission-q2-2026 recipes/gb200-fp4/1k1k/*.yaml
# (DSR1 sglang disagg structure).
- isl: 1024
osl: 1024
search-space:
# Low-concurrency / interactivity: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes.
# The recipe's benchmark.concurrencies ("1x4x8x16x32x64") lists the
# same values as conc-list; keep the two in sync when editing.
- conc-list: [1, 4, 8, 16, 32, 64]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
decode:
num-worker: 1
tp: 8
ep: 1
dp-attn: false
# Mid throughput: 1 prefill (DP=8) + 1 wide decode (DP=16). 6 nodes.
# The recipe's benchmark.concurrencies ("128x256x1024x2048x4096")
# lists the same values as conc-list; note that 512 is not swept.
- conc-list: [128, 256, 1024, 2048, 4096]
prefill:
num-worker: 1
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-1p1d-dep8-dep16.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# High throughput: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
# Concurrency 4096 also appears in the 1p1d dep8-dep16 entry above, so
# the two entries give a topology-crossover A/B at that point.
- conc-list: [4096, 8192]
prefill:
num-worker: 3
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true

# 8k/1k block kept commented out — same rationale as the
# dsv4-fp4-gb200-dynamo-vllm sibling: keep `sweep-enabled` runtime
# bounded. Uncomment to re-enable (recipes are already in place).
# - isl: 8192
# osl: 1024
# search-space:
# # Low-concurrency: 1 prefill (DP=8) + 1 decode (TP=8). 4 nodes.
# - conc-list: [1, 4, 8, 16, 32, 64]
# prefill:
# num-worker: 1
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-1p1d-dep8-tep8.yaml"
# decode:
# num-worker: 1
# tp: 8
# ep: 1
# dp-attn: false
# # Mid: 3 prefills (DP=8) + 1 wide decode (DP=16). 10 nodes.
# - conc-list: [512, 1024]
# prefill:
# num-worker: 3
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-3p1d-dep8-dep16.yaml"
# decode:
# num-worker: 1
# tp: 16
# ep: 16
# dp-attn: true
# # Max throughput: 7 prefills (DP=8) + 1 wide decode (DP=16). 18 nodes.
# - conc-list: [4096, 8192]
# prefill:
# num-worker: 7
# tp: 8
# ep: 8
# dp-attn: true
# additional-settings:
# - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-7p1d-dep8-dep16.yaml"
# decode:
# num-worker: 1
# tp: 16
# ep: 16
# dp-attn: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
name: "dsv4-sglang-disagg-gb200-1p1d-dep8-dep16"

# Hand-rolled — see ./disagg-gb200-1p1d-dep8-tep8.yaml header for the
# upstream-reference list (PR #69 GB200 agg, PR #75 GB300 disagg).
# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling.
#
# Topology: 1 prefill (DP=8 EP=8) + 1 decode (DP=16 EP=16). 6 nodes.
# Single prefill is enough for 1k prompts up to ~conc 4096 (per-rank
# prefill TFlops at 1k ISL is high; matches the vLLM sibling sizing).

# Model alias, container image, and precision used by every worker.
# NOTE(review): `path` is a short alias, not a filesystem path or HF
# repo id — presumably resolved by the launcher's model-path table;
# confirm the alias is registered on the target cluster.
model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
precision: "fp4"

# NOTE(review): a follow-up commit on this PR reports dynamo 0.8.1 as
# incompatible with the deepseek-v4 grace-blackwell container — confirm
# this pin before reusing the recipe.
dynamo:
version: 0.8.1

# Slurm wall-clock limit for the whole job (quoted so it stays a string).
slurm:
time_limit: "8:00:00"

# 1440 attempts x 10 s = 14400 s (4 h) of health polling, assuming one
# attempt per interval — half of the 8 h Slurm limit above.
health_check:
max_attempts: 1440
interval_seconds: 10

# Node/GPU budget, 6 nodes total:
#   prefill: 2 nodes x 4 GPUs = 8 GPUs  -> 1 worker x gpus_per_prefill 8
#   decode:  4 nodes x 4 GPUs = 16 GPUs -> 1 worker x gpus_per_decode 16
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 4
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 16

# Single dynamo frontend (multi-frontend mode is off).
frontend:
type: dynamo
enable_multiple_frontends: false

# Worker backend. No `connector` key: the field is unsupported for
# sglang recipes, so it is omitted rather than set to null.
backend:
type: sglang

# Environment for prefill workers. Every value is quoted so it is parsed
# as a string rather than a YAML integer.
# NOTE(review): units of the three SGLANG_DISAGGREGATION_* limits are
# not stated here — confirm against the SGLang version in the container.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

# Environment for decode workers — same variables and values as
# prefill_environment; keep the two blocks in sync.
decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

# Per-role SGLang server flags. Prefill and decode share the MoE
# backends, chunked-prefill size, memory fraction, context length, and
# nixl transfer backend; they differ in parallel sizes and batch limits.
sglang_config:
# Prefill: tensor-parallel-size, dp-size, and ep-size are all 8, matching
# gpus_per_prefill. context-length 3072 leaves headroom over the
# benchmark's isl + osl = 2048.
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
dp-size: 8
ep-size: 8
enable-dp-attention: true
moe-a2a-backend: "deepep"
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 16
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

# Decode: tensor-parallel-size, dp-size, and ep-size are all 16, matching
# gpus_per_decode. cuda-graph-max-bs is set equal to
# max-running-requests (512).
decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 16
dp-size: 16
ep-size: 16
enable-dp-attention: true
moe-a2a-backend: "deepep"
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 512
cuda-graph-max-bs: 512
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

# sa-bench sweep for the 1k/1k case (isl 1024, osl 1024).
# `concurrencies` is an "x"-separated list; the values here are the same
# as the conc-list of the nvidia-master.yaml entry that points at this
# recipe — keep the two in sync.
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "128x256x1024x2048x4096"
req_rate: "inf"
use_chat_template: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
name: "dsv4-sglang-disagg-gb200-1p1d-dep8-tep8"

# Hand-rolled — no GB200 DSV4 sglang disagg recipe exists upstream. The
# closest references on NVIDIA/srt-slurm are:
# * PR #69 (recipes/gb200-fp4/1k1k-dsv4/agg-2n-low-latency.yaml) —
# GB200 DSV4 sglang AGGREGATED: per-worker flag set + env vars.
# * PR #75 (recipes/gb300-fp4/1k1k-dsv4/disagg-1p1d-tp4-mxfp4.yaml) —
# GB300 DSV4 sglang DISAGG: confirms nixl + flashinfer_mxfp4 +
# chunked-prefill-size=4096 + disable-flashinfer-autotune.
# Topology mirrors the dsv4-fp4-gb200-dynamo-vllm sibling so cross-
# framework numbers stay directly comparable.
#
# Topology: 1 prefill (DP=8 EP=8) + 1 decode (TP=8, no DP-attn). 4 nodes.
# Targets very low concurrency (1-64) where TP-sharded decode gives the
# best per-user latency.

# Model alias, container image, and precision used by every worker.
# NOTE(review): `path` is a short alias, not a filesystem path or HF
# repo id — presumably resolved by the launcher's model-path table;
# confirm the alias is registered on the target cluster.
model:
path: "deepseek-v4-pro"
container: "lmsysorg/sglang:deepseek-v4-grace-blackwell"
precision: "fp4"

# NOTE(review): a follow-up commit on this PR reports dynamo 0.8.1 as
# incompatible with the deepseek-v4 grace-blackwell container — confirm
# this pin before reusing the recipe.
dynamo:
version: 0.8.1

# Slurm wall-clock limit for the whole job (quoted so it stays a string).
slurm:
time_limit: "8:00:00"

# 1440 attempts x 10 s = 14400 s (4 h) of health polling, assuming one
# attempt per interval — half of the 8 h Slurm limit above.
health_check:
max_attempts: 1440
interval_seconds: 10

# Node/GPU budget, 4 nodes total:
#   prefill: 2 nodes x 4 GPUs = 8 GPUs -> 1 worker x gpus_per_prefill 8
#   decode:  2 nodes x 4 GPUs = 8 GPUs -> 1 worker x gpus_per_decode 8
resources:
gpu_type: "gb200"
gpus_per_node: 4
prefill_nodes: 2
decode_nodes: 2
prefill_workers: 1
decode_workers: 1
gpus_per_prefill: 8
gpus_per_decode: 8

# Single dynamo frontend (multi-frontend mode is off).
frontend:
type: dynamo
enable_multiple_frontends: false

# Worker backend. No `connector` key: the field is unsupported for
# sglang recipes, so it is omitted rather than set to null.
backend:
type: sglang

# Env var set mirrored from PR #69 (the GB200 DSV4 aggregated baseline
# that's actually been run upstream) plus the disaggregation timeout
# triple — heartbeat 100k matches the DSR1 sglang disagg convention.
# Every value is quoted so it is parsed as a string rather than a YAML
# integer.
# NOTE(review): units of the three SGLANG_DISAGGREGATION_* limits are
# not stated here — confirm against the SGLang version in the container.
prefill_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

# Environment for decode workers — same variables and values as
# prefill_environment; keep the two blocks in sync.
decode_environment:
PYTHONUNBUFFERED: "1"
SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0"
NCCL_MNNVL_ENABLE: "1"
NCCL_CUMEM_ENABLE: "1"
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"

# Per-role SGLang server flags. Prefill and decode share the MoE runner
# backend, chunked-prefill size, memory fraction, context length, and
# nixl transfer backend; they differ in parallelism and batch limits.
sglang_config:
# Prefill: tensor-parallel-size, dp-size, and ep-size are all 8, matching
# gpus_per_prefill. context-length 3072 leaves headroom over the
# benchmark's isl + osl = 2048.
prefill:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
dp-size: 8
ep-size: 8
enable-dp-attention: true
moe-a2a-backend: "deepep"
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 16
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "prefill"
disaggregation-transfer-backend: nixl

# Decode: tensor-parallel only (size 8) — no dp-size, ep-size,
# enable-dp-attention, or moe-a2a-backend is set. cuda-graph-max-bs is
# set equal to max-running-requests (64), the top of the concurrency sweep.
# NOTE(review): the recipe name says "tep8" but no ep-size is configured
# here (nvidia-master.yaml lists ep: 1 for this decode) — confirm the
# name still reflects the intended topology.
decode:
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
trust-remote-code: true
tensor-parallel-size: 8
moe-runner-backend: "flashinfer_mxfp4"
chunked-prefill-size: 4096
disable-flashinfer-autotune: true
disable-radix-cache: true
mem-fraction-static: 0.82
context-length: 3072
max-running-requests: 64
cuda-graph-max-bs: 64
stream-interval: 50
decode-log-interval: 1000
disaggregation-mode: "decode"
disaggregation-transfer-backend: nixl

# sa-bench sweep for the 1k/1k case (isl 1024, osl 1024).
# `concurrencies` is an "x"-separated list; the values here are the same
# as the conc-list of the nvidia-master.yaml entry that points at this
# recipe — keep the two in sync.
benchmark:
type: "sa-bench"
isl: 1024
osl: 1024
concurrencies: "1x4x8x16x32x64"
req_rate: "inf"
use_chat_template: false
Loading
Loading