-
Notifications
You must be signed in to change notification settings - Fork 194
dsv4-b300-sglang-mtp: restore TP4 and DP-attn search-space entries #1180
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
f2dda86
4190a94
dbde9fa
0bbd08d
1fadfce
b8a625d
01d7a9f
74d5b69
f66a2df
e45c425
d053ff6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,12 +6,12 @@ source "$(dirname "$0")/../benchmark_lib.sh" | |
| # TP -- tensor parallel size -> --tp | ||
| # EP_SIZE -- expert parallel size -> --ep-size | ||
| # DP_ATTENTION -- "true" enables --enable-dp-attention --dp-size $TP | ||
| # Also selects MoE backend / chunked-prefill-size: | ||
| # true -> deepep + mega_moe + chunked-prefill 32768 | ||
| # false -> flashinfer_mxfp4 + chunked-prefill 8192 | ||
| # | ||
| # EAGLE/MTP speculative-decoding flags are hardcoded to (3, 1, 4): num-steps=3, | ||
| # eagle-topk=1, num-draft-tokens=4. Same chain across all CONC bands. | ||
| # Also selects MoE backend / chunked-prefill / EAGLE chain | ||
| # / mem-fraction-static / max-running-requests: | ||
| # true -> flashinfer_mxfp4 + DP-attn + chunked-prefill 32768 | ||
| # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 | ||
| # false -> flashinfer_mxfp4 (TP-only) + chunked-prefill 8192 | ||
| # + EAGLE (3,1,4) + mem-fraction 0.90 + max-running CONC*3/2 | ||
| check_env_vars \ | ||
| MODEL \ | ||
| TP \ | ||
|
|
@@ -63,40 +63,53 @@ fi | |
|
|
||
| start_gpu_monitor --output "$PWD/gpu_metrics.csv" | ||
|
|
||
| # Recipe path is selected by DP_ATTENTION; MoE backend and chunked-prefill-size follow. | ||
| # Recipe path is selected by DP_ATTENTION; MoE backend, chunked-prefill, EAGLE | ||
| # chain, mem-fraction, and max-running all follow. | ||
| DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' | ||
|
|
||
| # MTP (EAGLE) speculative-decoding flags applied unconditionally on every recipe. | ||
| SPEC_FLAGS=( | ||
| --speculative-algorithm EAGLE | ||
| --speculative-num-steps 3 | ||
| --speculative-eagle-topk 1 | ||
| --speculative-num-draft-tokens 4 | ||
| ) | ||
|
|
||
| if [ "${DP_ATTENTION}" = "true" ]; then | ||
| # Large-batch EP path: deepep + mega_moe. | ||
| export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 | ||
| export SGLANG_OPT_FIX_HASH_MEGA_MOE=1 | ||
| # DP-attn path: flashinfer_mxfp4 + DP-attn (covers conc 16-256). | ||
| export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1 | ||
| export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0 | ||
| export SGLANG_OPT_FIX_HASH_MEGA_MOE=0 | ||
| export SGLANG_OPT_USE_FAST_MASK_EP=1 | ||
| export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 | ||
| export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 | ||
| export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 | ||
| export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0 | ||
| SPEC_FLAGS=( | ||
| --speculative-algorithm EAGLE | ||
| --speculative-num-steps 1 | ||
| --speculative-eagle-topk 1 | ||
| --speculative-num-draft-tokens 2 | ||
| ) | ||
| PARALLEL_ARGS=( | ||
| --dp-size "$TP" | ||
| --enable-dp-attention | ||
| --moe-a2a-backend deepep | ||
| --moe-runner-backend flashinfer_mxfp4 | ||
| --disable-flashinfer-autotune | ||
| --deepep-config "$DEEPEP_CONFIG" | ||
| --cuda-graph-max-bs 256 | ||
| --schedule-conservativeness 2 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you explain why
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Still can be default, you can remove it
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. removed |
||
| ) | ||
| CHUNKED_PREFILL_SIZE=32768 | ||
| MEM_FRACTION_STATIC=0.92 | ||
| MAX_RUNNING_REQUESTS=256 | ||
| else | ||
| # Small-batch TP-only path: flashinfer_mxfp4. | ||
| # TP-only fallback for low-conc: flashinfer_mxfp4 + EAGLE (3,1,4). | ||
| SPEC_FLAGS=( | ||
| --speculative-algorithm EAGLE | ||
| --speculative-num-steps 3 | ||
| --speculative-eagle-topk 1 | ||
| --speculative-num-draft-tokens 4 | ||
| ) | ||
| PARALLEL_ARGS=( | ||
| --moe-runner-backend flashinfer_mxfp4 | ||
| --disable-flashinfer-autotune | ||
| ) | ||
| CHUNKED_PREFILL_SIZE=8192 | ||
| MEM_FRACTION_STATIC=0.90 | ||
| MAX_RUNNING_REQUESTS="$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" | ||
| fi | ||
|
|
||
| # Print all SGLANG_* env vars to both the CI step log and server.log so the | ||
|
|
@@ -116,8 +129,8 @@ PYTHONNOUSERSITE=1 sglang serve \ | |
| --tp $TP \ | ||
| --ep-size $EP_SIZE \ | ||
| --chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \ | ||
| --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ | ||
| --mem-fraction-static 0.90 \ | ||
| --max-running-requests "$MAX_RUNNING_REQUESTS" \ | ||
| --mem-fraction-static "$MEM_FRACTION_STATIC" \ | ||
| --swa-full-tokens-ratio 0.1 \ | ||
| "${SPEC_FLAGS[@]}" \ | ||
| "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is fine for now since we are in early stages of v4 support, but eventually we'd like these to just be default settings based on scenario in engine
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure