Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/schedule_nightly_test_a3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ jobs:
- name: multi-node-deepseek-v3.1
config_file_path: DeepSeek-V3.1-BF16.yaml
size: 2
- name: multi-node-deepseek-v3.2-W8A8-EP
config_file_path: DeepSeek-V3_2-W8A8-EP.yaml
size: 4
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
Expand Down
234 changes: 234 additions & 0 deletions tests/e2e/nightly/multi_node/config/DeepSeek-V3_2-W8A8-EP.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
test_name: "test DeepSeek-V3.2-W8A8-EP disaggregated_prefill"
model: "vllm-ascend/DeepSeek-V3.2-W8A8"
num_nodes: 4
npu_per_node: 16
env_common:
HCCL_OP_EXPANSION_MODE: "AIV"
VLLM_USE_MODELSCOPE: true
SERVER_PORT: 8080
OMP_PROC_BIND: false
OMP_NUM_THREADS: 10
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
HCCL_BUFFSIZE: 1024
VLLM_TORCH_PROFILER_WITH_STACK: 0
ASCEND_AGGREGATE_ENABLE: 1
ASCEND_TRANSPORT_PRINT: 1
ACL_OP_INIT_MODE: 1
ASCEND_A3_ENABLE: 1
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: 300000
VLLM_ENGINE_READY_TIMEOUT_S: 1800
HCCL_CONNECT_TIMEOUT: 1200
HCCL_INTRA_PCIE_ENABLE: 1
HCCL_INTRA_ROCE_ENABLE: 0

disaggregated_prefill:
enabled: true
prefiller_host_index: [0, 1]
decoder_host_index: [2, 3]


deployment:
-
envs:
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
server_cmd: >
vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-start-rank 0
--data-parallel-size-local 1
--data-parallel-address $LOCAL_IP
--tensor-parallel-size 16
--enable-expert-parallel
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
--seed 1024
--quantization ascend
--max-num-seqs 64
--max-model-len 68000
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.82
--enforce-eager
--no-enable-prefix-caching
--additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
--tokenizer-mode deepseek_v32
--reasoning-parser deepseek_v3
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_extra_config": {
"use_ascend_direct": true,
"prefill": {
"dp_size": 2,
"tp_size": 16
},
"decode": {
"dp_size": 8,
"tp_size": 4
}
}
}'


-
envs:
VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
server_cmd: >
vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
--host 0.0.0.0
--headless
--port $SERVER_PORT
--data-parallel-size 2
--data-parallel-start-rank 1
--data-parallel-size-local 1
--data-parallel-address $MASTER_IP
--tensor-parallel-size 16
--enable-expert-parallel
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
--seed 1024
--quantization ascend
--max-num-seqs 64
--max-model-len 68000
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.82
--enforce-eager
--no-enable-prefix-caching
--additional-config '{"enable_cpu_binding" : false, "enable_sfa_cp":false,"layer_sharding": ["q_b_proj", "o_proj"]}'
--tokenizer-mode deepseek_v32
--reasoning-parser deepseek_v3
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "30000",
"engine_id": "0",
"kv_connector_extra_config": {
"use_ascend_direct": true,
"prefill": {
"dp_size": 2,
"tp_size": 16
},
"decode": {
"dp_size": 8,
"tp_size": 4
}
}
}'
-
envs:
VLLM_ASCEND_ENABLE_MLAPO: 1
TASK_QUEUE_ENABLE: 1
server_cmd: >
vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
--host 0.0.0.0
--port $SERVER_PORT
--data-parallel-size 8
--data-parallel-size-local 4
--data-parallel-start-rank 0
--data-parallel-address $LOCAL_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 4
--enable-expert-parallel
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
--seed 1024
--quantization ascend
--max-model-len 68000
--max-num-batched-tokens 12
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
--trust-remote-code
--max-num-seqs 4
--gpu-memory-utilization 0.95
--no-enable-prefix-caching
--async-scheduling
--additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
--tokenizer-mode deepseek_v32
--reasoning-parser deepseek_v3
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "30200",
"engine_id": "1",
"kv_connector_extra_config": {
"use_ascend_direct": true,
"prefill": {
"dp_size": 2,
"tp_size": 16
},
"decode": {
"dp_size": 8,
"tp_size": 4
}
}
}'

-
envs:
VLLM_ASCEND_ENABLE_MLAPO: 1
TASK_QUEUE_ENABLE: 1
server_cmd: >
vllm serve vllm-ascend/DeepSeek-V3.2-W8A8
--host 0.0.0.0
--headless
--port $SERVER_PORT
--data-parallel-size 8
--data-parallel-size-local 4
--data-parallel-start-rank 4
--data-parallel-address $MASTER_IP
--data-parallel-rpc-port 13389
--tensor-parallel-size 4
--enable-expert-parallel
--speculative-config '{"num_speculative_tokens": 2, "method":"deepseek_mtp"}'
--seed 1024
--quantization ascend
--max-model-len 68000
--max-num-batched-tokens 12
--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3, 6, 9, 12]}'
--trust-remote-code
--max-num-seqs 4
--gpu-memory-utilization 0.95
--no-enable-prefix-caching
--async-scheduling
--additional-config '{"enable_cpu_binding" : false,"recompute_scheduler_enable" : true}'
--tokenizer-mode deepseek_v32
--reasoning-parser deepseek_v3
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "30200",
"engine_id": "1",
"kv_connector_extra_config": {
"use_ascend_direct": true,
"prefill": {
"dp_size": 2,
"tp_size": 16
},
"decode": {
"dp_size": 8,
"tp_size": 4
}
}
}'
benchmarks:
perf:
case_type: performance
dataset_path: vllm-ascend/GSM8K-in3500-bs2800
request_conf: vllm_api_stream_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
num_prompts: 512
max_out_len: 1500
batch_size: 512
request_rate: 11.2
baseline: 1146
threshold: 0.97
acc:
case_type: accuracy
dataset_path: vllm-ascend/gsm8k-lite
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
max_out_len: 4096
batch_size: 64
baseline: 95
threshold: 5
4 changes: 2 additions & 2 deletions tests/e2e/nightly/multi_node/scripts/multi_node_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ def __enter__(self):
if not self.is_master or self.cfg is None:
logger.info("Not launching proxy on non-master node")
return self
- prefiller_ips = [self.nodes[i].ip for i in self.cfg.prefiller_indices]
- decoder_ips = [self.nodes[i].ip for i in self.cfg.decoder_indices]
+ prefiller_ips = [self.nodes[i].ip for i in self.cfg.prefiller_indices if not self.nodes[i].headless]
+ decoder_ips = [self.nodes[i].ip for i in self.cfg.decoder_indices if not self.nodes[i].headless]

cmd = [
"python",
Expand Down