Skip to content
Merged
6 changes: 6 additions & 0 deletions .github/workflows/nightly_test_a3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ jobs:
- name: multi-node-qwenw8a8-2node-eplb
config_file_path: Qwen3-235B-W8A8-EPLB.yaml
size: 2
- name: multi-node-deepseek-r1-w8a8-longseq
config_file_path: DeepSeek-R1-W8A8-longseq.yaml
size: 2
- name: multi-node-qwenw8a8-2node-longseq
config_file_path: Qwen3-235B-W8A8-longseq.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Nightly multi-node case: DeepSeek-R1 W8A8, long sequence, disaggregated prefill.
test_name: "test DeepSeek-R1-W8A8-longseq disaggregated_prefill"
model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 2
npu_per_node: 16
# Environment variables for the test run. SERVER_PORT is the value referenced
# as $SERVER_PORT by the server_cmd entries under `deployment`.
# NOTE(review): presumably applied to every node — confirm against the harness.
env_common:
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 10
  # Quoted so the embedded colon can never be read as a nested mapping.
  PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
  HCCL_DETERMINISTIC: true
  TASK_QUEUE_ENABLE: 1
  HCCL_OP_RETRY_ENABLE: "L0:0, L1:0"

# Disaggregated-prefill topology for this case.
# NOTE(review): the index lists presumably select hosts out of `num_nodes`
# (0 = first node) — confirm against the test harness.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0]
  decoder_host_index: [1]

deployment:
# First entry: the KV producer ("kv_role": "kv_producer"), run with
# --enforce-eager.
# TODO: add TP-asymmetric cases (requested in review; deferred by the author
# until #5224 is merged).
# Every line of the folded scalar sits at the same indent on purpose: folded
# style joins equally-indented lines with spaces, so the whole value becomes
# one shell command line, while more-indented lines would keep their newlines.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 1
    --decode-context-parallel-size 8
    --prefill-context-parallel-size 2
    --tensor-parallel-size 8
    --cp-kv-cache-interleave-size 128
    --enforce-eager
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 4
    --max-model-len 32768
    --max-num-batched-tokens 16384
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --enable-chunked-prefill
    --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_producer",
    "kv_port": "30000",
    "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
    "kv_connector_extra_config": {
    "prefill": {
    "dp_size": 1,
    "tp_size": 8
    },
    "decode": {
    "dp_size": 2,
    "tp_size": 8
    }
    }
    }'

# Second entry: the KV consumer ("kv_role": "kv_consumer"). Unlike the
# producer entry it omits --enforce-eager and passes a --compilation_config
# with cudagraph_mode FULL_DECODE_ONLY.
# All lines of the folded scalar share one indent so they fold into a single
# shell command line.
- server_cmd: >
    vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 2
    --decode-context-parallel-size 2
    --prefill-context-parallel-size 1
    --tensor-parallel-size 8
    --cp-kv-cache-interleave-size 128
    --enable-expert-parallel
    --seed 1024
    --quantization ascend
    --max-num-seqs 4
    --max-model-len 32768
    --max-num-batched-tokens 256
    --trust-remote-code
    --gpu-memory-utilization 0.9
    --compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}'
    --enable-chunked-prefill
    --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_consumer",
    "kv_port": "30100",
    "engine_id": "1",
    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
    "kv_connector_extra_config": {
    "prefill": {
    "dp_size": 1,
    "tp_size": 8
    },
    "decode": {
    "dp_size": 2,
    "tp_size": 8
    }
    }
    }'

# Benchmark cases run against the deployed servers; only an accuracy case
# (`acc`) on the gsm8k dataset is defined here.
# NOTE(review): `baseline`/`threshold` presumably mean "score must stay within
# 5 of 95" — confirm against the harness.
# NOTE(review): max_out_len equals the 32768 passed as --max-model-len in both
# server_cmd entries, leaving no headroom for prompt tokens — confirm intended.
benchmarks:
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 32768
    batch_size: 512
    baseline: 95
    threshold: 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Nightly multi-node case: Qwen3-235B-A22B W8A8, long sequence, disaggregated
# prefill.
test_name: "test Qwen3-235B-A22B-W8A8-longseq disaggregated_prefill"
model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
num_nodes: 2
npu_per_node: 16
# Environment variables for the test run. SERVER_PORT is the value referenced
# as $SERVER_PORT by the server_cmd entries under `deployment`.
# NOTE(review): presumably applied to every node — confirm against the harness.
env_common:
  VLLM_USE_MODELSCOPE: true
  OMP_PROC_BIND: false
  OMP_NUM_THREADS: 100
  HCCL_BUFFSIZE: 1024
  SERVER_PORT: 8080
  NUMEXPR_MAX_THREADS: 128
# Disaggregated-prefill topology for this case.
# NOTE(review): the index lists presumably select hosts out of `num_nodes`
# (0 = first node) — confirm against the test harness.
disaggregated_prefill:
  enabled: true
  prefiller_host_index: [0]
  decoder_host_index: [1]

deployment:
# First entry: the KV producer ("kv_role": "kv_producer"), run with
# --enforce-eager and prefix caching disabled.
# All lines of the folded scalar share one indent so they fold into a single
# shell command line.
- server_cmd: >
    vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 1
    --decode-context-parallel-size 2
    --prefill-context-parallel-size 2
    --tensor-parallel-size 8
    --cp-kv-cache-interleave-size 128
    --seed 1024
    --enforce-eager
    --enable-expert-parallel
    --max-num-seqs 16
    --max-model-len 8192
    --max-num-batched-tokens 8192
    --quantization ascend
    --trust-remote-code
    --no-enable-prefix-caching
    --gpu-memory-utilization 0.9
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_producer",
    "kv_port": "30000",
    "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
    "kv_connector_extra_config": {
    "prefill": {
    "dp_size": 1,
    "tp_size": 8
    },
    "decode": {
    "dp_size": 2,
    "tp_size": 8
    }
    }
    }'

# Second entry: the KV consumer ("kv_role": "kv_consumer"). Unlike the
# producer entry it omits --enforce-eager and passes a --compilation_config
# with cudagraph_mode FULL_DECODE_ONLY.
# All lines of the folded scalar share one indent so they fold into a single
# shell command line.
- server_cmd: >
    vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8"
    --host 0.0.0.0
    --port $SERVER_PORT
    --data-parallel-size 2
    --decode-context-parallel-size 2
    --prefill-context-parallel-size 1
    --tensor-parallel-size 8
    --cp-kv-cache-interleave-size 128
    --seed 1024
    --quantization ascend
    --max-num-seqs 16
    --max-model-len 8192
    --max-num-batched-tokens 8192
    --enable-expert-parallel
    --trust-remote-code
    --no-enable-prefix-caching
    --gpu-memory-utilization 0.9
    --compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
    --kv-transfer-config
    '{"kv_connector": "MooncakeConnectorV1",
    "kv_role": "kv_consumer",
    "kv_port": "30100",
    "engine_id": "1",
    "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector",
    "kv_connector_extra_config": {
    "prefill": {
    "dp_size": 1,
    "tp_size": 8
    },
    "decode": {
    "dp_size": 2,
    "tp_size": 8
    }
    }
    }'
# No benchmark cases are configured for this model. The author confirmed in
# review that skipping them is expected, so the value is spelled as an
# explicit null rather than left as a bare empty key.
benchmarks: null
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The benchmarks section is empty. This will result in the performance and accuracy tests for this model being skipped, making the test configuration ineffective. Please provide the necessary benchmark configurations for perf and acc, similar to the DeepSeek-R1-W8A8-longseq.yaml file.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dsxsteven is this skip expected?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

Loading