Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ce53cf1
add mtp configs
ch-wan May 8, 2026
debb7b9
Add sbatch_directives to MTP recipes (root-cause fix)
ch-wan May 8, 2026
59f899d
Change deepgemm flags
Fridge003 May 8, 2026
5f02885
Move MTP recipes up to 8k1k/ with -mtp filename suffix
ch-wan May 8, 2026
50c6c59
fix
ch-wan May 8, 2026
4bceea3
Drop custom_tokenizer from MTP recipes — incompatible with sa-bench
ch-wan May 8, 2026
8738f93
Merge branch 'main' into sglang-disagg-gb300-mtp-0507
ch-wan May 8, 2026
e1a5081
Pin srt-slurm to fork w/ SGLangDeepseekV4Tokenizer callable + restore…
ch-wan May 8, 2026
530762b
Bump sglang container to nightly-dev-cu13-20260508-2cf1a4ab (latest m…
ch-wan May 8, 2026
6db9e2c
Restore base dsv4-fp4-gb300-dynamo-sglang image to staging tag
ch-wan May 8, 2026
164f5a2
Pin MTP recipes to dynamo 81d0555e (matches working base recipes)
ch-wan May 8, 2026
6d28994
Explicitly disable CAR_V2 in multi-node decode MTP recipes
ch-wan May 8, 2026
9c4c244
Explicitly disable CAR_V2 in 8k1k base decode recipes too
ch-wan May 8, 2026
9814b42
Set both old and new sglang thinking/reasoning env vars in MTP recipes
ch-wan May 8, 2026
3e049e8
Set tool-call-parser=deepseekv4 to enable DSV4 chat encoding (gsm8k r…
ch-wan May 9, 2026
255e7fb
Revert CAR_V2 explicit-disable in non-MTP base 8k1k recipes
ch-wan May 9, 2026
cb59807
Trim verbose comments and drop deprecated env var names in MTP recipes
ch-wan May 9, 2026
9ff03f2
Revert MTP recipes to staging-dev container (gsm8k accuracy fix)
ch-wan May 9, 2026
9b06113
Bump dynamo hash to 34d55a5 to fix DSV4 chat-template formatter
ch-wan May 9, 2026
36bf040
Bump sglang container to nightly-dev-cu13-20260509-9ee83034
ch-wan May 9, 2026
3275282
Switch DSV4 MTP recipes to nixl KV transfer backend
ch-wan May 9, 2026
1ffcab9
Merge remote-tracking branch 'origin/main' into sglang-disagg-gb300-m…
ch-wan May 9, 2026
daa6785
Revert "Switch DSV4 MTP recipes to nixl KV transfer backend"
ch-wan May 10, 2026
072e2ee
Bump MTP recipes to sglang nightly with mooncake DSv4 fix
ch-wan May 10, 2026
eae8d32
Merge remote-tracking branch 'origin/main' into sglang-disagg-gb300-m…
ch-wan May 10, 2026
6ff6545
gb300-cw: switch srt-slurm pin to NVIDIA/srt-slurm main (#144 merged)
ch-wan May 10, 2026
cb45485
gb300-cw: track NVIDIA/srt-slurm main instead of pinning a commit
ch-wan May 10, 2026
79d2cb6
Bump MTP recipes to sglang nightly 20260510-2473659e
ch-wan May 10, 2026
32e623d
Merge remote-tracking branch 'origin/main' into sglang-disagg-gb300-m…
ch-wan May 10, 2026
35a2f9a
fix: use shared gb300 dsv4 model path
Oseltamivir May 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8425,3 +8425,110 @@ dsv4-fp4-gb300-dynamo-sglang:
tp: 12
ep: 12
dp-attn: true

# MTP variant of dsv4-fp4-gb300-dynamo-sglang.
# Each search-space entry selects one 8k1k "-mtp" recipe through CONFIG_FILE;
# the num-worker/tp/ep/dp-attn values restate the topology encoded in that
# recipe's file name (e.g. "1p6d-dep4-tp4" = 1 prefill worker with tp=ep=4 and
# dp-attn, 6 decode workers with tp=4).
dsv4-fp4-gb300-dynamo-sglang-mtp:
  # Keep this tag in sync with model.container in the referenced -mtp recipes
  # (the low-latency recipes pin nightly-dev-cu13-20260510-2473659e).
  image: lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb300-cw
  precision: fp4
  framework: dynamo-sglang
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
    - isl: 8192
      osl: 1024
      search-space:
      # Low-latency baseline: 1p1d-tp4-tp4. 2 nodes.
      - spec-decoding: "mtp"
        conc-list: [1]
        prefill:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false
          additional-settings:
          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false
      # Low-latency 1p6d-dep4-tp4: 1P (DEP=4) + 6 TP=4 decode workers. 7 nodes.
      # Recipe runs concurrencies=8x32x64; matrix tracks the max.
      - spec-decoding: "mtp"
        conc-list: [64]
        prefill:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml"
        decode:
          num-worker: 6
          tp: 4
          ep: 1
          dp-attn: false
      # Mid curve 1p1d-dep4-dep8. 3 nodes.
      - spec-decoding: "mtp"
        conc-list: [256]
        prefill:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true
      # Mid curve 1p1d-dep4-dep16. 5 nodes.
      - spec-decoding: "mtp"
        conc-list: [256]
        prefill:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml"
        decode:
          num-worker: 1
          tp: 16
          ep: 16
          dp-attn: true
      # Mid curve 2p1d-dep4-dep8. 4 nodes.
      - spec-decoding: "mtp"
        conc-list: [512]
        prefill:
          num-worker: 2
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true
      # Mid curve 4p1d-dep4-dep8. 6 nodes.
      - spec-decoding: "mtp"
        conc-list: [1024]
        prefill:
          num-worker: 4
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# Disaggregated low-latency MTP recipe (ISL 8192 / OSL 1024): one prefill
# worker and one decode worker, each TP=4 on a single 4-GPU GB300 node.
name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p1d-tp4-tp4-mtp"

frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 8

dynamo:
  # Full commit hash, quoted so it always loads as a string.
  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
  install: true

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
  precision: "mxfp4"

sbatch_directives:
  cpus-per-task: "144"
  mem: "0"

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 1
  decode_workers: 1

backend:
  type: sglang

  # Prefill and decode use the same environment in this recipe.
  prefill_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_DISABLE_REUSE: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DEFAULT_THINKING: "1"
    SGLANG_DSV4_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_USE_JIT_NORM: "1"
    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
    SGLANG_OPT_USE_TOPK_V2: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"

  decode_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_DISABLE_REUSE: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DEFAULT_THINKING: "1"
    SGLANG_DSV4_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_USE_JIT_NORM: "1"
    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
    SGLANG_OPT_USE_TOPK_V2: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
    # is single-node only and corrupts results in 2-node decode setups.
    # NOTE(review): decode in this recipe is one TP=4 worker on one 4-GPU
    # node (decode_nodes: 1), so the 2-node rationale does not describe this
    # topology; presumably left unset for parity with the multi-node MTP
    # decode recipes - confirm.

  sglang_config:
    prefill:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      model-path: "/model/"
      trust-remote-code: true
      tool-call-parser: deepseekv4  # gates dsv4 chat-encoding spec.

      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: mooncake

      tensor-parallel-size: 4
      data-parallel-size: 1
      expert-parallel-size: 1

      moe-runner-backend: "flashinfer_mxfp4"
      disable-flashinfer-autotune: true

      mem-fraction-static: 0.9
      max-running-requests: 8
      cuda-graph-max-bs: 8
      chunked-prefill-size: 32768

    decode:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      model-path: "/model/"
      trust-remote-code: true
      tool-call-parser: deepseekv4  # gates dsv4 chat-encoding spec.

      disaggregation-mode: "decode"
      disaggregation-transfer-backend: mooncake

      tensor-parallel-size: 4
      data-parallel-size: 1
      expert-parallel-size: 1

      moe-runner-backend: "flashinfer_mxfp4"
      disable-flashinfer-autotune: true

      # Speculative decoding is configured on the decode worker only; the
      # prefill section carries no speculative-* keys.
      speculative-algo: "EAGLE"
      speculative-num-steps: 3
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 4

      mem-fraction-static: 0.9
      max-running-requests: 8
      cuda-graph-max-bs: 8
      swa-full-tokens-ratio: 0.1
      context-length: 16384

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  random_range_ratio: 0.8
  concurrencies: "1"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Disaggregated low-latency MTP recipe (ISL 8192 / OSL 1024): one prefill
# worker with TP=DP=EP=4 and dp-attention, plus six TP=4 decode workers on
# six decode nodes (4 GPUs per node).
name: "dsv4-pro-gb300-disagg-8k1k-low-latency-1p6d-dep4-tp4-mtp"

frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 8

dynamo:
  # Full commit hash, quoted so it always loads as a string.
  hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c"
  install: true

model:
  path: "deepseek-v4-pro"
  container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e"
  precision: "mxfp4"

sbatch_directives:
  cpus-per-task: "144"
  mem: "0"

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  prefill_nodes: 1
  prefill_workers: 1
  decode_nodes: 6
  decode_workers: 6

backend:
  type: sglang

  prefill_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_DISABLE_REUSE: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DEFAULT_THINKING: "1"
    SGLANG_DSV4_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_USE_JIT_NORM: "1"
    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
    SGLANG_OPT_USE_TOPK_V2: "1"

    # Prefill-only flags: these keys do not appear in decode_environment.
    SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
    SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1"
    SGLANG_OPT_USE_FAST_MASK_EP: "1"
    SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1"
    SGLANG_OPT_FIX_HASH_MEGA_MOE: "1"
    SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216"
    SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1"
    SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1"
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0"

    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"

  decode_environment:
    PYTHONUNBUFFERED: "1"
    SGLANG_RADIX_DISABLE_REUSE: "1"
    SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
    SGLANG_DEFAULT_THINKING: "1"
    SGLANG_DSV4_REASONING_EFFORT: "max"
    SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
    SGLANG_OPT_USE_JIT_NORM: "1"
    SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1"
    SGLANG_OPT_USE_TOPK_V2: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    MC_FORCE_MNNVL: "1"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
    # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2
    # is single-node only and corrupts results in 2-node decode setups.
    # NOTE(review): decode_workers equals decode_nodes (6) and each worker is
    # TP=4 with gpus_per_node: 4, so each decode worker presumably fits on a
    # single node, while prefill above does set the flag to "1". Confirm
    # whether the 2-node rationale applies to this recipe.

  sglang_config:
    prefill:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      model-path: "/model/"
      trust-remote-code: true
      tool-call-parser: deepseekv4  # gates dsv4 chat-encoding spec.

      disaggregation-mode: "prefill"
      disaggregation-transfer-backend: mooncake

      tensor-parallel-size: 4
      data-parallel-size: 4
      expert-parallel-size: 4

      enable-dp-attention: true
      enable-dp-lm-head: true

      moe-a2a-backend: "deepep"
      deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

      mem-fraction-static: 0.9
      max-running-requests: 128
      cuda-graph-max-bs: 128
      chunked-prefill-size: 32768

    decode:
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      model-path: "/model/"
      trust-remote-code: true
      tool-call-parser: deepseekv4  # gates dsv4 chat-encoding spec.

      disaggregation-mode: "decode"
      disaggregation-transfer-backend: mooncake

      tensor-parallel-size: 4
      data-parallel-size: 1
      expert-parallel-size: 1

      moe-runner-backend: "flashinfer_mxfp4"
      disable-flashinfer-autotune: true

      # Speculative decoding is configured on the decode workers only; the
      # prefill section carries no speculative-* keys.
      speculative-algo: "EAGLE"
      speculative-num-steps: 3
      speculative-eagle-topk: 1
      speculative-num-draft-tokens: 4

      mem-fraction-static: 0.9
      max-running-requests: 128
      cuda-graph-max-bs: 128
      swa-full-tokens-ratio: 0.1
      context-length: 16384

benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  random_range_ratio: 0.8
  # Three concurrency points in one run: 8, 32 and 64.
  concurrencies: "8x32x64"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
Loading
Loading