Skip to content
This repository was archived by the owner on Apr 20, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions configs/checkout-branch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we rename this file to gb200-fp4-mtp-setup.sh?

I don't have a good way of organizing these, so descriptive naming is probably the best.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about checkout-pr-13115.sh?
I also found out we need to disable the engine patch since this PR is based on main. I tried rebasing to 0.5.5 but it might have some dependencies on other PRs.

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I can assist with this tomorrow

# Check out the MTP-support branch from trevor-m's sglang fork into the
# container's sglang workspace. Safe to re-run.
set -euo pipefail

BRANCH="dev/mtp_support_for_eagle_worker_v1"

cd /sgl-workspace/sglang
# Idempotent: `git remote add` fails if the remote already exists, so ignore
# that error on re-runs (the URL is fixed, so an existing remote is fine).
git remote add trevor https://github.com/trevor-m/sglang.git 2>/dev/null || true
git fetch trevor "$BRANCH"
git checkout "trevor/${BRANCH}"

121 changes: 121 additions & 0 deletions recipies/gb200-fp4/1p2d-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Recipe: GB200 FP4, 1 prefill / 2 decode workers, with MTP (EAGLE speculative
# decoding).
# NOTE(review): indentation was lost in the pasted source (which left the file
# with duplicate top-level keys — invalid YAML); nesting reconstructed from the
# sibling recipe's layout. Confirm against the consuming tool's schema, in
# particular whether `sglang_config` belongs under `backend`.
name: "gb200-fp4-1p2d-mtp"

model:
  path: "dsfp4"
  container: "0.5.5.post2"
  precision: "fp4"

resources:
  gpu_type: "gb200"
  prefill_nodes: 1
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 2
  gpus_per_node: 4

backend:
  # Prefill-specific environment variables. Values are quoted strings on
  # purpose — environment variables reach the consumer as strings.
  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
    # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
    # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
    MC_FORCE_MNNVL: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
    SGLANG_ENABLE_FLASHINFER_GEMM: "true"  # instead of SGLANG_FLASHINFER_FP4_GEMM_BACKEND

  # Decode-specific environment variables (mirror of prefill_environment).
  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_DECODE_BOOTSTRAP_TIMEOUT: "1000"
    # SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
    # SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
    MC_FORCE_MNNVL: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_ENABLE_JIT_DEEPGEMM: "false"
    SGLANG_ENABLE_FLASHINFER_GEMM: "true"  # instead of SGLANG_FLASHINFER_FP4_GEMM_BACKEND

sglang_config:
  prefill:
    disaggregation-mode: "prefill"
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    trust-remote-code: true
    disable-radix-cache: true
    kv-cache-dtype: "fp8_e4m3"
    attention-backend: "trtllm_mla"
    quantization: "modelopt_fp4"
    moe-runner-backend: "flashinfer_trtllm"
    stream-interval: 10
    watchdog-timeout: 1000000
    context-length: 2200
    mem-fraction-static: 0.95
    max-total-tokens: 8192
    chunked-prefill-size: 8192
    cuda-graph-max-bs: 256
    max-running-requests: 512
    scheduler-recv-interval: 10
    enable-symm-mem: true
    moe-dense-tp-size: 1
    load-balance-method: "round_robin"
    disaggregation-bootstrap-port: 30001
    data-parallel-size: 1
    tensor-parallel-size: 4
    expert-parallel-size: 1
    speculative-algorithm: "EAGLE"
    speculative-num-steps: 2
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 3

  decode:
    disaggregation-mode: "decode"
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    prefill-round-robin-balance: true
    trust-remote-code: true
    disable-radix-cache: true
    kv-cache-dtype: "fp8_e4m3"
    attention-backend: "trtllm_mla"
    quantization: "modelopt_fp4"
    moe-runner-backend: "flashinfer_trtllm"
    disaggregation-bootstrap-port: 30001
    stream-interval: 10
    watchdog-timeout: 1000000
    context-length: 2200
    mem-fraction-static: 0.90
    chunked-prefill-size: 8192
    cuda-graph-max-bs: 256
    scheduler-recv-interval: 10
    enable-symm-mem: true
    moe-dense-tp-size: 1
    tensor-parallel-size: 4
    expert-parallel-size: 1
    speculative-algorithm: "EAGLE"
    speculative-num-steps: 2
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 3

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "4x8x32x64x112x128x256"
  req_rate: "inf"
192 changes: 192 additions & 0 deletions recipies/gb200-fp4/max-tpt-2-mtp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
# 4P1D, with 8 Decode Nodes. Better per-gpu throughput with worse latency.
# NOTE(review): indentation was lost in the pasted source (which left the file
# with duplicate top-level keys — invalid YAML); nesting reconstructed from the
# file's own section comments. Confirm against the consuming tool's schema, in
# particular whether `sglang_config` belongs under `backend`.

name: "gb200-fp4-max-tpt-2-mtp"

model:
  path: "dsfp4"
  container: "0.5.5.post2"
  precision: "fp4"

resources:
  gpu_type: "gb200"
  prefill_nodes: 4
  decode_nodes: 8
  prefill_workers: 4
  decode_workers: 1
  gpus_per_node: 4

backend:
  # Prefill-specific environment variables. Values are quoted strings on
  # purpose — environment variables reach the consumer as strings.
  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
    MC_TE_METRIC: "true"
    MC_FORCE_MNNVL: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"

  # Decode-specific environment variables
  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: "1800"
    PYTHONUNBUFFERED: "1"
    DYN_SKIP_SGLANG_LOG_FORMATTING: "1"
    SGLANG_NVFP4_CKPT_FP8_GEMM_IN_ATTN: "1"
    SGLANG_PER_TOKEN_GROUP_QUANT_8BIT_V2: "1"
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: "100000"
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
    SGLANG_HACK_SEQ_BOOTSTRAP_ROOM: "1"
    MC_TE_METRIC: "true"
    MC_FORCE_MNNVL: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_CUMEM_ENABLE: "1"
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: "0"
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: "1"
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "1024"
    SGLANG_CUTEDSL_MOE_NVFP4_DISPATCH: "1"
    SGLANG_FLASHINFER_FP4_GEMM_BACKEND: "cutlass"

sglang_config:
  prefill:
    # Model configuration
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    trust-remote-code: true

    # KV cache and attention
    kv-cache-dtype: "fp8_e4m3"
    attention-backend: "trtllm_mla"

    # Quantization
    quantization: "modelopt_fp4"
    moe-runner-backend: "flashinfer_cutlass"

    # Radix cache disabled
    disable-radix-cache: true
    disable-chunked-prefix-cache: true

    # Other flags
    stream-interval: 50
    decode-log-interval: 1000
    watchdog-timeout: 1000000
    context-length: 2176
    disable-shared-experts-fusion: true
    eplb-algorithm: "deepseek"
    disaggregation-bootstrap-port: 30001

    # Prefill-specific mode
    disaggregation-mode: "prefill"

    # Memory and token limits
    mem-fraction-static: 0.84
    max-total-tokens: 131072
    max-prefill-tokens: 32768
    chunked-prefill-size: 65536
    # enable-single-batch-overlap: true

    # Request handling
    max-running-requests: 30000
    load-balance-method: "round_robin"

    # Performance optimizations
    disable-cuda-graph: true
    enable-dp-attention: true

    # Parallelism
    tp-size: 4
    dp-size: 4
    ep-size: 4

    # MTP
    speculative-algorithm: "EAGLE"
    speculative-num-steps: 1
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 2
    speculative-moe-runner-backend: "cutlass"
    speculative-moe-a2a-backend: "none"

  decode:
    # Model configuration
    served-model-name: "deepseek-ai/DeepSeek-R1"
    model-path: "/model/"
    trust-remote-code: true

    # KV cache and attention
    kv-cache-dtype: "fp8_e4m3"
    attention-backend: "trtllm_mla"

    # Quantization
    quantization: "modelopt_fp4"
    moe-runner-backend: "flashinfer_cutedsl"

    # Radix cache disabled
    disable-radix-cache: true
    disable-chunked-prefix-cache: true

    # Other flags
    stream-interval: 50
    decode-log-interval: 1000
    watchdog-timeout: 1000000
    context-length: 2176
    disable-shared-experts-fusion: true
    eplb-algorithm: "deepseek"
    disaggregation-bootstrap-port: 30001

    # Decode-specific mode
    disaggregation-mode: "decode"

    # Memory and token limits
    mem-fraction-static: 0.73
    max-total-tokens: 3122380
    chunked-prefill-size: 786432

    # Request handling
    max-running-requests: 33792  # 67584

    # DeepEP configuration
    moe-a2a-backend: "deepep"
    deepep-mode: "low_latency"
    ep-dispatch-algorithm: "static"
    ep-num-redundant-experts: 32

    # CUDA graphs (extensive batch size list)
    cuda-graph-bs: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256]  # , 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 416, 448, 480, 512] #, 544, 576, 608, 640, 672, 704, 736, 768, 1024]
    num-reserved-decode-tokens: 224  # 112

    # Additional decode optimizations
    moe-dense-tp-size: 1
    enable-dp-lm-head: true
    prefill-round-robin-balance: true
    enable-dp-attention: true

    # Parallelism
    tp-size: 32
    dp-size: 32
    ep-size: 32

    # MTP
    speculative-algorithm: "EAGLE"
    speculative-num-steps: 1
    speculative-eagle-topk: 1
    speculative-num-draft-tokens: 2
    speculative-moe-runner-backend: "cutlass"
    speculative-moe-a2a-backend: "none"

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1x128x512x2048x4096x8192x12000x15000"
  req_rate: "inf"