-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
[CI] Add Async Eplb nightly CI tests #29385
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
a260e08
3689a83
62d95a9
2f25913
a8f8f2e
abd244a
211cf79
44a1ae7
fa5d1ac
e903e99
ef0c811
c35dad6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
#!/usr/bin/env bash
#
# Nightly CI check: serve a model with async EPLB enabled and verify its
# GSM8K accuracy for each all2all backend.
#
# Usage: <script> [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#   THRESHOLD      minimum acceptable accuracy            (default: 0.25)
#   NUM_QUESTIONS  number of GSM8K questions to evaluate  (default: 1319)
#   START_PORT     port for the first server; the script bumps it by one
#                  after every backend                    (default: 8030)
# Environment:
#   OUT_DIR        directory for result JSON files
#                  (default: /tmp/vllm-scheduled)
set -euxo pipefail

THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p -- "${OUT_DIR}"
|
|
||
#######################################
# Block until the server's /health endpoint answers successfully.
# Arguments:
#   $1 - TCP port on 127.0.0.1 to poll
#   $2 - optional overall deadline handed to timeout(1); default 600
# Returns:
#   0 once /health responds; the non-zero status of timeout(1) if the
#   deadline passes first.
#######################################
wait_for_server() {
  local port=${1:?wait_for_server: port is required}
  local limit=${2:-600}
  # The port is handed to the inner shell as a positional parameter instead of
  # being spliced into the script text, so its value is never parsed as code.
  timeout "${limit}" bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      sleep 1
    done' _ "${port}"
}
|
|
||
MODEL="deepseek-ai/DeepSeek-V2-lite"

# Choose the all2all backends to exercise. The host is treated as ROCm when
# rocm-smi is on PATH, /opt/rocm exists, or ROCM_PATH is set and non-empty.
if command -v rocm-smi &> /dev/null \
  || [[ -d /opt/rocm ]] \
  || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
|
|
||
#######################################
# Stop the background server, if one is running.
# Sends SIGTERM, allows roughly 10 s for a graceful exit, escalates to SIGKILL
# only if the process is still alive, then reaps it.
# Globals:
#   SERVER_PID (read, cleared) - PID of the server started by this script
# Returns:
#   0 always, so it is safe under `set -e` and as an EXIT trap.
#######################################
cleanup() {
  local pid=${SERVER_PID:-}
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  kill "${pid}" 2>/dev/null || true
  for _ in {1..100}; do
    kill -0 "${pid}" 2>/dev/null || break
    sleep 0.1
  done
  # Escalate only when the graceful shutdown did not finish in time.
  if kill -0 "${pid}" 2>/dev/null; then
    kill -9 "${pid}" 2>/dev/null || true
  fi
  # Reap the child so no zombie entry lingers; its exit status is irrelevant.
  wait "${pid}" 2>/dev/null || true
  SERVER_PID=
}
trap cleanup EXIT
|
|
||
#######################################
# Turn a model name into a filename-safe tag.
# Replaces '/', ':', space and newline with '_'; every other character,
# including the letter 'n', is kept as-is.
# Arguments: $1 - model name
# Outputs:   the tag on stdout
#######################################
model_tag() {
  printf '%s' "$1" | tr '/: \n' '____'
}

# For each backend: start a server with async EPLB, wait until it is healthy,
# run the GSM8K eval, enforce the accuracy threshold, then stop the server.
# Relies on MODEL, BACKENDS, PORT, NUM_Q, THRESHOLD, OUT_DIR, wait_for_server
# and cleanup being defined earlier in the script.
for BACK in "${BACKENDS[@]}"; do
  serve_args=(
    --enforce-eager
    --tensor-parallel-size 2
    --data-parallel-size 2
    --enable-expert-parallel
    --enable-eplb
    # use_async is a JSON boolean here; the string "true" would depend on the
    # config parser coercing strings to booleans.
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}'
    --trust-remote-code
    --max-model-len 2048
    --port "${PORT}"
  )
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="${BACK}" \
    vllm serve "${MODEL}" "${serve_args[@]}" &
  SERVER_PID=$!
  wait_for_server "${PORT}"

  TAG=$(model_tag "${MODEL}")
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "${PORT}" \
    --num-questions "${NUM_Q}" \
    --save-results "${OUT}"

  # Values reach Python through argv and the heredoc delimiter is quoted, so
  # paths or names containing quotes cannot corrupt the Python source.
  python3 - "${OUT}" "${THRESHOLD}" "${MODEL} ${BACK}" <<'PY'
import json
import sys

path, threshold, label = sys.argv[1], float(sys.argv[2]), sys.argv[3]
with open(path) as fh:
    acc = json.load(fh)['accuracy']
print(f"{label}: accuracy {acc:.3f}")
assert acc >= threshold, f"{label} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do | |
| --data-parallel-size 2 \ | ||
| --enable-expert-parallel \ | ||
| --enable-eplb \ | ||
| --eplb-config '{"window_size":200,"step_interval":600}' \ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice catch — I'm not sure the default step interval was actually triggering EPLB here. |
||
| --trust-remote-code \ | ||
| --max-model-len 2048 \ | ||
| --port $PORT & | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,74 @@ | ||||||
#!/usr/bin/env bash
#
# Nightly CI check: serve a model with async EPLB enabled and verify its
# GSM8K accuracy for each all2all backend.
#
# Usage: <script> [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
#   THRESHOLD      minimum acceptable accuracy            (default: 0.25)
#   NUM_QUESTIONS  number of GSM8K questions to evaluate  (default: 1319)
#   START_PORT     port for the first server; the script bumps it by one
#                  after every backend                    (default: 8040)
# Environment:
#   OUT_DIR        directory for result JSON files
#                  (default: /tmp/vllm-scheduled)
set -euxo pipefail

THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p -- "${OUT_DIR}"
|
|
||||||
#######################################
# Poll http://127.0.0.1:<port>/health until it returns success.
# Arguments:
#   $1 - TCP port on 127.0.0.1 to poll
#   $2 - optional overall deadline handed to timeout(1); default 600
# Returns:
#   0 once /health responds; the non-zero status of timeout(1) if the
#   deadline passes first.
#######################################
wait_for_server() {
  local port=${1:?wait_for_server: port is required}
  local limit=${2:-600}
  # Pass the port as "$1" of the inner shell rather than embedding it in the
  # script string, so the value is treated purely as data.
  timeout "${limit}" bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      sleep 1
    done' _ "${port}"
}
|
|
||||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Choose the all2all backends to exercise. The host is treated as ROCm when
# rocm-smi is on PATH, /opt/rocm exists, or ROCM_PATH is set and non-empty.
if command -v rocm-smi &> /dev/null \
  || [[ -d /opt/rocm ]] \
  || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
|
|
||||||
#######################################
# Terminate the background server recorded in SERVER_PID, if it is alive.
# SIGTERM first, roughly 10 s of grace, SIGKILL only when the process has not
# exited by then; the child is reaped afterwards.
# Globals:
#   SERVER_PID (read, cleared) - PID of the server started by this script
# Returns:
#   0 always, so it is safe under `set -e` and as an EXIT trap.
#######################################
cleanup() {
  local pid=${SERVER_PID:-}
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  kill "${pid}" 2>/dev/null || true
  for _ in {1..100}; do
    kill -0 "${pid}" 2>/dev/null || break
    sleep 0.1
  done
  # Force-kill only if the graceful shutdown did not complete.
  if kill -0 "${pid}" 2>/dev/null; then
    kill -9 "${pid}" 2>/dev/null || true
  fi
  # Collect the child's status so it does not remain as a zombie.
  wait "${pid}" 2>/dev/null || true
  SERVER_PID=
}
trap cleanup EXIT
|
|
||||||
#######################################
# Turn a model name into a filename-safe tag.
# Replaces '/', ':', space and newline with '_'; every other character,
# including the letter 'n', is kept as-is.
# Arguments: $1 - model name
# Outputs:   the tag on stdout
#######################################
model_tag() {
  printf '%s' "$1" | tr '/: \n' '____'
}

# For each backend: start a server with async EPLB and MTP speculative
# decoding, wait until it is healthy, run the GSM8K eval, enforce the accuracy
# threshold, then stop the server.
# Relies on MODEL, BACKENDS, PORT, NUM_Q, THRESHOLD, OUT_DIR, wait_for_server
# and cleanup being defined earlier in the script.
for BACK in "${BACKENDS[@]}"; do
  serve_args=(
    --enforce-eager
    --tensor-parallel-size 4
    --enable-expert-parallel
    --enable-eplb
    # use_async is a JSON boolean here; the string "true" would depend on the
    # config parser coercing strings to booleans.
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}'
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
    --trust-remote-code
    --max-model-len 2048
    --gpu-memory-utilization 0.9
    --port "${PORT}"
  )
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="${BACK}" \
    vllm serve "${MODEL}" "${serve_args[@]}" &
  SERVER_PID=$!
  wait_for_server "${PORT}"

  TAG=$(model_tag "${MODEL}")
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py \
    --host http://127.0.0.1 \
    --port "${PORT}" \
    --num-questions "${NUM_Q}" \
    --save-results "${OUT}"

  # Values reach Python through argv and the heredoc delimiter is quoted, so
  # paths or names containing quotes cannot corrupt the Python source.
  python3 - "${OUT}" "${THRESHOLD}" "${MODEL} ${BACK}" <<'PY'
import json
import sys

path, threshold, label = sys.argv[1], float(sys.argv[2]), sys.argv[3]
with open(path) as fh:
    acc = json.load(fh)['accuracy']
print(f"{label}: accuracy {acc:.3f}")
assert acc >= threshold, f"{label} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `use_async` parameter in the JSON for `--eplb-config` is specified as the string `"true"`. For it to be correctly parsed as a boolean value by Pydantic, it should be the JSON boolean literal `true` (without quotes). With the current value, the async feature will not be enabled, which defeats the purpose of this test.