-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[CI] Add multi-nodes longseq configs of DeepSeek-R1-W8A8 & Qwen3-235B-W8A8 #5381
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
61ddfea
f14e3ea
992f42c
7c81306
fa5f99b
245ea5e
7471c13
c2ff57a
f6e0a47
01a765e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,109 @@ | ||
| test_name: "test DeepSeek-R1-W8A8-longseq disaggregated_prefill" | ||
| model: "vllm-ascend/DeepSeek-R1-0528-W8A8" | ||
| num_nodes: 2 | ||
| npu_per_node: 16 | ||
| env_common: | ||
| VLLM_USE_MODELSCOPE: true | ||
| HCCL_BUFFSIZE: 1024 | ||
| SERVER_PORT: 8080 | ||
| OMP_PROC_BIND: false | ||
| OMP_NUM_THREADS: 10 | ||
| PYTORCH_NPU_ALLOC_CONF: expandable_segments:True | ||
| HCCL_DETERMINISTIC: True | ||
| TASK_QUEUE_ENABLE: 1 | ||
| HCCL_OP_RETRY_ENABLE: "L0:0, L1:0" | ||
|
|
||
| disaggregated_prefill: | ||
| enabled: true | ||
| prefiller_host_index: [0] | ||
| decoder_host_index: [1] | ||
|
|
||
| deployment: | ||
| - | ||
| server_cmd: > | ||
| vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 | ||
| --host 0.0.0.0 | ||
| --port $SERVER_PORT | ||
| --data-parallel-size 1 | ||
| --decode-context-parallel-size 8 | ||
| --prefill-context-parallel-size 2 | ||
| --tensor-parallel-size 8 | ||
| --cp-kv-cache-interleave-size 128 | ||
| --enforce-eager | ||
| --enable-expert-parallel | ||
| --seed 1024 | ||
| --quantization ascend | ||
| --max-num-seqs 4 | ||
| --max-model-len 32768 | ||
| --max-num-batched-tokens 16384 | ||
| --trust-remote-code | ||
| --gpu-memory-utilization 0.9 | ||
| --enable-chunked-prefill | ||
| --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' | ||
| --kv-transfer-config | ||
| '{"kv_connector": "MooncakeConnectorV1", | ||
| "kv_role": "kv_producer", | ||
| "kv_port": "30000", | ||
| "engine_id": "0", | ||
| "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", | ||
| "kv_connector_extra_config": { | ||
| "prefill": { | ||
| "dp_size": 1, | ||
| "tp_size": 8 | ||
| }, | ||
| "decode": { | ||
| "dp_size": 2, | ||
| "tp_size": 8 | ||
| } | ||
| } | ||
| }' | ||
|
|
||
| - | ||
| server_cmd: > | ||
| vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8 | ||
| --host 0.0.0.0 | ||
| --port $SERVER_PORT | ||
| --data-parallel-size 2 | ||
| --decode-context-parallel-size 2 | ||
| --prefill-context-parallel-size 1 | ||
| --tensor-parallel-size 8 | ||
| --cp-kv-cache-interleave-size 128 | ||
| --enable-expert-parallel | ||
| --seed 1024 | ||
| --quantization ascend | ||
| --max-num-seqs 4 | ||
| --max-model-len 32768 | ||
| --max-num-batched-tokens 256 | ||
| --trust-remote-code | ||
| --gpu-memory-utilization 0.9 | ||
| --compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}' | ||
| --enable-chunked-prefill | ||
| --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}' | ||
| --kv-transfer-config | ||
| '{"kv_connector": "MooncakeConnectorV1", | ||
| "kv_role": "kv_consumer", | ||
| "kv_port": "30100", | ||
| "engine_id": "1", | ||
| "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", | ||
| "kv_connector_extra_config": { | ||
| "prefill": { | ||
| "dp_size": 1, | ||
| "tp_size": 8 | ||
| }, | ||
| "decode": { | ||
| "dp_size": 2, | ||
| "tp_size": 8 | ||
| } | ||
| } | ||
| }' | ||
|
|
||
| benchmarks: | ||
| acc: | ||
| case_type: accuracy | ||
| dataset_path: vllm-ascend/gsm8k | ||
| request_conf: vllm_api_general_chat | ||
| dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt | ||
| max_out_len: 32768 | ||
| batch_size: 512 | ||
| baseline: 95 | ||
| threshold: 5 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| test_name: "test Qwen3-235B-A22B-W8A8-longseq disaggregated_prefill" | ||
| model: "vllm-ascend/Qwen3-235B-A22B-W8A8" | ||
| num_nodes: 2 | ||
| npu_per_node: 16 | ||
| env_common: | ||
| VLLM_USE_MODELSCOPE: true | ||
| OMP_PROC_BIND: false | ||
| OMP_NUM_THREADS: 100 | ||
| HCCL_BUFFSIZE: 1024 | ||
| SERVER_PORT: 8080 | ||
| NUMEXPR_MAX_THREADS: 128 | ||
| disaggregated_prefill: | ||
| enabled: true | ||
| prefiller_host_index: [0] | ||
| decoder_host_index: [1] | ||
|
|
||
| deployment: | ||
| - | ||
| server_cmd: > | ||
| vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8" | ||
| --host 0.0.0.0 | ||
| --port $SERVER_PORT | ||
| --data-parallel-size 1 | ||
| --decode-context-parallel-size 2 | ||
| --prefill-context-parallel-size 2 | ||
| --tensor-parallel-size 8 | ||
| --cp-kv-cache-interleave-size 128 | ||
| --seed 1024 | ||
| --enforce-eager | ||
| --enable-expert-parallel | ||
| --max-num-seqs 16 | ||
| --max-model-len 8192 | ||
| --max-num-batched-tokens 8192 | ||
| --quantization ascend | ||
| --trust-remote-code | ||
| --no-enable-prefix-caching | ||
| --gpu-memory-utilization 0.9 | ||
| --kv-transfer-config | ||
| '{"kv_connector": "MooncakeConnectorV1", | ||
| "kv_role": "kv_producer", | ||
| "kv_port": "30000", | ||
| "engine_id": "0", | ||
| "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", | ||
| "kv_connector_extra_config": { | ||
| "prefill": { | ||
| "dp_size": 1, | ||
| "tp_size": 8 | ||
| }, | ||
| "decode": { | ||
| "dp_size": 2, | ||
| "tp_size": 8 | ||
| } | ||
| } | ||
| }' | ||
|
|
||
| - | ||
| server_cmd: > | ||
| vllm serve "vllm-ascend/Qwen3-235B-A22B-W8A8" | ||
| --host 0.0.0.0 | ||
| --port $SERVER_PORT | ||
| --data-parallel-size 2 | ||
| --decode-context-parallel-size 2 | ||
| --prefill-context-parallel-size 1 | ||
| --tensor-parallel-size 8 | ||
| --cp-kv-cache-interleave-size 128 | ||
| --seed 1024 | ||
| --quantization ascend | ||
| --max-num-seqs 16 | ||
| --max-model-len 8192 | ||
|
dsxsteven marked this conversation as resolved.
|
||
| --max-num-batched-tokens 8192 | ||
| --enable-expert-parallel | ||
| --trust-remote-code | ||
| --no-enable-prefix-caching | ||
| --gpu-memory-utilization 0.9 | ||
|
dsxsteven marked this conversation as resolved.
|
||
| --compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}' | ||
| --kv-transfer-config | ||
| '{"kv_connector": "MooncakeConnectorV1", | ||
| "kv_role": "kv_consumer", | ||
| "kv_port": "30100", | ||
| "engine_id": "1", | ||
| "kv_connector_module_path": "vllm_ascend.distributed.mooncake_connector", | ||
| "kv_connector_extra_config": { | ||
| "prefill": { | ||
| "dp_size": 1, | ||
| "tp_size": 8 | ||
| }, | ||
| "decode": { | ||
| "dp_size": 2, | ||
| "tp_size": 8 | ||
| } | ||
| } | ||
| }' | ||
| benchmarks: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @dsxsteven, is this skip expected?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes. |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Follow-up test cases covering TP asymmetry still need to be added.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: to be done after #5224 is merged.