3 changes: 2 additions & 1 deletion .github/workflows/_e2e_nightly_multi_node.yaml
@@ -101,7 +101,8 @@ jobs:
- name: Decode kubeconfig from secrets
run: |
# Decode and save kubeconfig
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
# echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
cp /root/.cache/.kube/kubeconfig.yaml $KUBECONFIG

- name: Checkout code
uses: actions/checkout@v6
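
Editorial note (not part of the diff): the step above now assumes a kubeconfig is pre-baked on the self-hosted runner at /root/.cache/.kube/kubeconfig.yaml. A minimal guard along these lines, assuming kubectl is available on the runner, would catch a missing or stale file early:

# Hypothetical guard step (not in this PR): fail fast if the runner-local
# kubeconfig is absent or the cluster is unreachable.
test -s "$KUBECONFIG" || { echo "kubeconfig missing at $KUBECONFIG" >&2; exit 1; }
kubectl --kubeconfig "$KUBECONFIG" get nodes --no-headers | head -n 5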
69 changes: 42 additions & 27 deletions .github/workflows/nightly_test_a3.yaml
@@ -28,7 +28,10 @@ on:
pull_request:
branches:
- 'main'
types: [ labeled ]
# types: [ labeled ]
push:
branches:
- 'main'

# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
@@ -44,39 +44,51 @@ concurrency:
jobs:
multi-node-tests:
name: multi-node
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
# if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
strategy:
fail-fast: false
max-parallel: 1
matrix:
test_config:
- name: multi-node-deepseek-pd
config_file_path: DeepSeek-V3.yaml
size: 2
- name: multi-node-qwen3-dp
config_file_path: Qwen3-235B-A22B.yaml
size: 2
# - name: multi-node-dpsk-4node-pd
# config_file_path: DeepSeek-R1-W8A8.yaml
# size: 4
- name: multi-node-qwenw8a8-2node
config_file_path: Qwen3-235B-W8A8.yaml
size: 2
# - name: multi-node-deepseek-r1-w8a8-eplb
# config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
# size: 4
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
- name: multi-node-dpsk3.2-2node
config_file_path: DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-longseq
config_file_path: DeepSeek-R1-W8A8-longseq.yaml
# - name: multi-node-deepseek-pd
# config_file_path: DeepSeek-V3.yaml
# size: 2
# - name: multi-node-qwen3-dp
# config_file_path: Qwen3-235B-A22B.yaml
# size: 2
# # - name: multi-node-dpsk-4node-pd
# # config_file_path: DeepSeek-R1-W8A8.yaml
# # size: 4
# - name: multi-node-qwenw8a8-2node
# config_file_path: Qwen3-235B-W8A8.yaml
# size: 2
# # - name: multi-node-deepseek-r1-w8a8-eplb
# # config_file_path: DeepSeek-R1-W8A8-EPLB.yaml
# # size: 4
# - name: multi-node-qwenw8a8-2node-eplb
# config_file_path: Qwen3-235B-W8A8-EPLB.yaml
# size: 2
- name: multi-node-qwen3-vl-235b-a22b
config_file_path: Qwen3-VL-235B-A22B-Instruct.yaml
size: 2
- name: multi-node-qwenw8a8-2node-longseq
config_file_path: Qwen3-235B-W8A8-longseq.yaml
# - name: multi-node-dpsk3.2-2node
# config_file_path: DeepSeek-V3_2-W8A8-A3-dual-nodes.yaml
# size: 2
# - name: multi-node-deepseek-r1-w8a8-longseq
# config_file_path: DeepSeek-R1-W8A8-longseq.yaml
# size: 2
# - name: multi-node-qwenw8a8-2node-longseq
# config_file_path: Qwen3-235B-W8A8-longseq.yaml
# size: 2
- name: multi-node-qwen3-235b-a22b-pd
config_file_path: Qwen3-235B-A22B-pd.yaml
size: 2
# - name: multi-node-deepseek-r1-w8a8-longseq
# config_file_path: DeepSeek-R1-W8A8-longseq.yaml
# size: 2
# - name: multi-node-qwenw8a8-2node-longseq
# config_file_path: Qwen3-235B-W8A8-longseq.yaml
# size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
121 changes: 121 additions & 0 deletions tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B-pd.yaml
@@ -0,0 +1,121 @@
test_name: "test Qwen3-235B-A22B pd online"
model: "Qwen/Qwen3-235B-A22B"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
HCCL_BUFFSIZE: 1024
HCCL_OP_EXPANSION_MODE: "AIV"
OMP_PROC_BIND: false
OMP_NUM_THREADS: 1
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
VLLM_ASCEND_ENABLE_FUSED_MC2: 2
TASK_QUEUE_ENABLE: 1
SERVER_PORT: 8080

disaggregated_prefill:
enabled: true
prefiller_host_index: [0]
decoder_host_index: [1]

deployment:
-
server_cmd: >
vllm serve "Qwen/Qwen3-235B-A22B"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-size-local 2
--data-parallel-start-rank 0
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 8
--seed 1024
--max-num-seqs 32
--max-model-len 8192
--max-num-batched-tokens 8192
--enable-expert-parallel
--trust-remote-code
--gpu-memory-utilization 0.9
--no-enable-prefix-caching
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"use_ascend_direct": true,
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 4,
"tp_size": 4
}
}
}'

-
server_cmd: >
vllm serve "Qwen/Qwen3-235B-A22B"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 4
--data-parallel-size-local 4
--data-parallel-start-rank 0
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 4
--seed 1024
--max-num-seqs 32
--max-model-len 8192
--max-num-batched-tokens 8192
--enable-expert-parallel
--trust-remote-code
--gpu-memory-utilization 0.9
--no-enable-prefix-caching
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
--async-scheduling
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "30100",
"engine_id": "1",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"use_ascend_direct": true,
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 4,
"tp_size": 4
}
}
}'

benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs2800
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 2800
max_out_len: 1500
batch_size: 700
request_rate: 11.2
baseline: 1
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 7680
batch_size: 512
baseline: 97
threshold: 3
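
Editorial note: in this config the prefiller (host index 0) runs dp=2 x tp=8 and the decoder (host index 1) runs dp=4 x tp=4, so each role occupies exactly the 16 NPUs of its node. Once both roles are up, a smoke test along these lines (a sketch, not part of the test harness; it assumes the OpenAI-compatible endpoint that vllm serve exposes on $SERVER_PORT) confirms end-to-end token flow through the disaggregated setup:

# Hypothetical smoke test: push one request through the serving endpoint.
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "Qwen/Qwen3-235B-A22B",
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "max_tokens": 16
      }'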
108 changes: 108 additions & 0 deletions tests/e2e/nightly/multi_node/config/Qwen3-VL-235B-A22B-Instruct.yaml
@@ -0,0 +1,108 @@
test_name: "test qwen3-vl-235b-a22b disaggregated_prefill on A3"
model: "Qwen/Qwen3-VL-235B-A22B-Instruct"
num_nodes: 2
npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
OMP_PROC_BIND: false
OMP_NUM_THREADS: 1
HCCL_OP_EXPANSION_MODE: "AIV"
TASK_QUEUE_ENABLE: 1
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True

disaggregated_prefill:
enabled: true
prefiller_host_index: [0]
decoder_host_index: [1]

deployment:
-
server_cmd: >
vllm serve "Qwen/Qwen3-VL-235B-A22B-Instruct"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-size-local 2
--tensor-parallel-size 8
--seed 1024
--enable-expert-parallel
--max-num-seqs 32
--max-model-len 8192
--max-num-batched-tokens 8192
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 4,
"tp_size": 4
}
}
}'
-
server_cmd: >
vllm serve "Qwen/Qwen3-VL-235B-A22B-Instruct"
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 4
--data-parallel-size-local 4
--tensor-parallel-size 4
--seed 1024
--enable-expert-parallel
--max-num-seqs 32
--max-model-len 8192
--max-num-batched-tokens 8192
--trust-remote-code
--no-enable-prefix-caching
--gpu-memory-utilization 0.9
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "30200",
"engine_id": "1",
"kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 2,
"tp_size": 8
},
"decode": {
"dp_size": 4,
"tp_size": 4
}
}
}'
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/textvqa-perf-1080p
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
num_prompts: 2800
max_out_len: 1500
batch_size: 64
request_rate: 11.2
baseline: 1
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/textvqa-lite
request_conf: vllm_api_stream_chat
dataset_conf: textvqa/textvqa_gen_base64
max_out_len: 7680
batch_size: 64
baseline: 85
threshold: 5
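
Editorial note: the textvqa benchmarks above send base64-encoded images, so a manual probe of this deployment needs a multimodal payload. A sketch (assuming a local sample.jpg and the OpenAI-compatible vision message format that vLLM accepts):

# Hypothetical multimodal smoke test: one image-plus-question round trip.
IMG_B64=$(base64 -w0 sample.jpg)
curl -s http://127.0.0.1:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d "{
        \"model\": \"Qwen/Qwen3-VL-235B-A22B-Instruct\",
        \"messages\": [{
          \"role\": \"user\",
          \"content\": [
            {\"type\": \"image_url\", \"image_url\": {\"url\": \"data:image/jpeg;base64,${IMG_B64}\"}},
            {\"type\": \"text\", \"text\": \"What does the image show?\"}
          ]
        }],
        \"max_tokens\": 32
      }"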
13 changes: 13 additions & 0 deletions tests/e2e/nightly/multi_node/scripts/run.sh
@@ -157,6 +157,18 @@ kill_npu_processes() {
sleep 4
}

upgrade_vllm_ascend_scr() {
# FIXME(Potabk): Remove this once the image build ships the latest source.
# The separate per-architecture build process currently fails during
# cross-compilation, so the image does not build correctly and the
# nightly tests end up running against stale code.
cd "$WORKSPACE/vllm-ascend"
#git pull origin main
git fetch origin pull/5442/head:pr-5442
git checkout pr-5442
}
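
Editorial note: pull/<N>/head is GitHub's read-only ref for a pull request's head commit, so the fetch above pins the workspace to PR #5442 regardless of what the image shipped with. A quick verification sketch (not part of this PR) before the tests run:

# Hypothetical check: confirm the workspace is on the fetched PR branch.
cd "$WORKSPACE/vllm-ascend"
git rev-parse --abbrev-ref HEAD   # expected output: pr-5442
git log -1 --oneline              # the PR head commit the nightly will test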

run_tests_with_log() {
set +e
kill_npu_processes
@@ -181,6 +193,7 @@ main() {
if [[ "$CONFIG_YAML_PATH" == *"DeepSeek-V3_2-Exp-bf16.yaml" ]]; then
install_extra_components
fi
upgrade_vllm_ascend_scr
cd "$WORKSPACE/vllm-ascend"
run_tests_with_log
}
8 changes: 6 additions & 2 deletions vllm_ascend/ascend_config.py
@@ -108,8 +108,12 @@ def __init__(self, vllm_config: "VllmConfig"):
decode_tp_size = min(decode_tp_size, num_kv_head)
self.pd_head_ratio = prefill_tp_size // decode_tp_size
except Exception:
raise AssertionError(
"Can not get num_key_value_heads from model_config")
raise ValueError(
"The text_config extracted from the model config does not have a "
"`num_key_value_heads` attribute. This indicates a mismatch "
"between the model config and vLLM's expectations. Please "
"ensure the model config is compatible with vLLM."
)

if self.pd_tp_ratio == 0:
raise AssertionError(