33 changes: 19 additions & 14 deletions tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8-A2.yaml
@@ -3,11 +3,16 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 2
npu_per_node: 8
env_common:
+ VLLM_ASCEND_ENABLE_MLAPO: 1
+ VLLM_ASCEND_BALANCE_SCHEDULING: 1
+ HCCL_INTRA_PCIE_ENABLE: 1
+ HCCL_INTRA_ROCE_ENABLE: 0
+ PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
VLLM_USE_MODELSCOPE: true
- HCCL_BUFFSIZE: 1024
+ HCCL_BUFFSIZE: 200
SERVER_PORT: 8080
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 10
+ OMP_NUM_THREADS: 1


deployment:
@@ -23,15 +28,15 @@ deployment:
--no-enable-prefix-caching
--max-num-seqs 16
--tensor-parallel-size 4
- --max-model-len 36864
- --max-num-batched-tokens 6000
+ --max-model-len 16384
+ --max-num-batched-tokens 4096
--enable-expert-parallel
+ --async-scheduling
--trust-remote-code
--quantization ascend
- --gpu-memory-utilization 0.9
- --enforce-eager
- --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
- --additional-config '{"enable_weight_nz_layout":true}'
+ --gpu-memory-utilization 0.92
+ --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
+ --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'

-
server_cmd: >
@@ -45,13 +50,13 @@ deployment:
--no-enable-prefix-caching
--max-num-seqs 16
--tensor-parallel-size 4
- --max-model-len 36864
- --max-num-batched-tokens 6000
+ --max-model-len 16384
+ --max-num-batched-tokens 4096
--enable-expert-parallel
+ --async-scheduling
--trust-remote-code
--quantization ascend
- --gpu-memory-utilization 0.9
- --enforce-eager
- --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
- --additional-config '{"enable_weight_nz_layout":true}'
+ --gpu-memory-utilization 0.92
+ --speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
+ --compilation-config '{"cudagraph_capture_sizes":[4,16,32,48,64], "cudagraph_mode": "FULL_DECODE_ONLY"}'
benchmarks:
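Note: dropping `--enforce-eager` in favor of an explicit `--compilation-config` lets decode steps run from captured graphs; `FULL_DECODE_ONLY` should mean only decode batches are captured while prefill stays eager. With `--max-num-seqs 16` and 3 speculative MTP tokens, a full decode step is at most 16 × (1+3) = 64 tokens, which may be why 64 is the largest capture size. A minimal sketch of how the flag's JSON payload can be built and shell-quoted, using only values from this diff:

```python
import json
import shlex

# Capture sizes should bracket the decode batch sizes the server will see.
compilation_config = {
    "cudagraph_capture_sizes": [4, 16, 32, 48, 64],
    "cudagraph_mode": "FULL_DECODE_ONLY",
}
# Quote so the embedded JSON survives the shell as a single argv entry.
print(f"--compilation-config {shlex.quote(json.dumps(compilation_config))}")
```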
@@ -56,7 +56,7 @@ deployment:
}
}'
--additional-config
- '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+ '{"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'

-
server_cmd: >
@@ -94,7 +94,7 @@ deployment:
}
}'
--additional-config
- '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true,"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
+ '{"dynamic_eplb":true,"num_iterations_eplb_update":2048,"num_wait_worker_iterations":200}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
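The `--additional-config` payload above is a single JSON object even though the diff wraps it: only the two prefill/layout keys are dropped, and the EPLB settings are unchanged. As a dict, with meanings inferred from the key names rather than from documentation:

```python
import json

additional_config = {
    "dynamic_eplb": True,                # runtime expert-parallel load balancing
    "num_iterations_eplb_update": 2048,  # presumably steps between rebalances
    "num_wait_worker_iterations": 200,   # presumably warm-up before rebalancing
}
print(f"--additional-config '{json.dumps(additional_config)}'")
```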
@@ -3,11 +3,12 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
+ HCCL_OP_EXPANSION_MODE: AIV
VLLM_USE_MODELSCOPE: true
- HCCL_BUFFSIZE: 1024
+ HCCL_BUFFSIZE: 768
SERVER_PORT: 8080
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 10
+ OMP_NUM_THREADS: 1
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
HCCL_DETERMINISTIC: True
TASK_QUEUE_ENABLE: 1
@@ -34,10 +35,10 @@ deployment:
--seed 1024
--quantization ascend
--max-num-seqs 4
- --max-model-len 36864
+ --max-model-len 32768
--max-num-batched-tokens 16384
--trust-remote-code
- --gpu-memory-utilization 0.9
+ --gpu-memory-utilization 0.85
--enable-chunked-prefill
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
--kv-transfer-config
@@ -72,10 +73,10 @@ deployment:
--seed 1024
--quantization ascend
--max-num-seqs 4
- --max-model-len 36864
+ --max-model-len 32768
--max-num-batched-tokens 256
--trust-remote-code
- --gpu-memory-utilization 0.9
+ --gpu-memory-utilization 0.85
--compilation_config '{"cudagraph_capture_sizes":[4,8,12,16],"cudagraph_mode": "FULL_DECODE_ONLY"}'
--enable-chunked-prefill
--speculative-config '{"num_speculative_tokens": 3, "method":"mtp"}'
@@ -103,7 +104,7 @@ benchmarks:
dataset_path: vllm-ascend/gsm8k
request_conf: vllm_api_general_chat
dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
- max_out_len: 32768
+ max_out_len: 24576
batch_size: 512
baseline: 95
threshold: 5
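The lowered `max_out_len` lines up with the decode server's reduced `--max-model-len`. A quick check, assuming the prompt and the completion share the 32768-token context budget:

```python
max_model_len = 32768  # new decode-side --max-model-len above
max_out_len = 24576    # new benchmark output cap
assert max_out_len < max_model_len
print(max_model_len - max_out_len)  # 8192 tokens left for the prompt;
                                    # the old cap (32768) left no room
```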
25 changes: 20 additions & 5 deletions tests/e2e/nightly/multi_node/config/DeepSeek-R1-W8A8.yaml
@@ -3,11 +3,12 @@ model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
num_nodes: 4
npu_per_node: 16
env_common:
+ HCCL_OP_EXPANSION_MODE: AIV
VLLM_USE_MODELSCOPE: true
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 10
+ OMP_NUM_THREADS: 1
PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
HCCL_DETERMINISTIC: True
TASK_QUEUE_ENABLE: 1
@@ -36,6 +37,7 @@ deployment:
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.9
+ --no-enable-prefix-caching
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
@@ -55,7 +57,7 @@ deployment:
}
}'
--additional-config
- '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+ '{"recompute_scheduler_enable":true}'

-
server_cmd: >
@@ -74,6 +76,7 @@ deployment:
--max-num-batched-tokens 16384
--trust-remote-code
--gpu-memory-utilization 0.9
+ --no-enable-prefix-caching
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
@@ -93,7 +96,7 @@ deployment:
}
}'
--additional-config
- '{"enable_prefill_optimizations":true,"enable_weight_nz_layout":true}'
+ '{"recompute_scheduler_enable":true}'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -113,7 +116,9 @@ deployment:
--max-num-batched-tokens 256
--trust-remote-code
--gpu-memory-utilization 0.9
+ --no-enable-prefix-caching
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
+ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
@@ -132,7 +137,11 @@ deployment:
}
}'
--additional-config
- '{"multistream_overlap_shared_expert":true}'
+ '{"recompute_scheduler_enable":true,
+ "enable_shared_expert_dp":true,
+ "multistream_overlap_shared_expert":true,
+ "finegrained_tp_config": {"lmhead_tensor_parallel_size":8}
+ }'
-
server_cmd: >
vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
@@ -151,7 +160,9 @@ deployment:
--max-num-batched-tokens 256
--trust-remote-code
--gpu-memory-utilization 0.9
+ --no-enable-prefix-caching
--speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
+ --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'
--kv-transfer-config
'{"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
@@ -170,7 +181,11 @@ deployment:
}
}'
--additional-config
- '{"multistream_overlap_shared_expert":true}'
+ '{"recompute_scheduler_enable":true,
+ "enable_shared_expert_dp":true,
+ "multistream_overlap_shared_expert":true,
+ "finegrained_tp_config": {"lmhead_tensor_parallel_size":8}
+ }'
benchmarks:
perf:
case_type: performance
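On the decode instances the `--additional-config` payload grows from one key to four; split across diff lines, it is still one JSON object. A sketch of it as a dict, with field meanings inferred from the key names only (e.g. `lmhead_tensor_parallel_size` presumably shards the LM head across an 8-way tensor-parallel group):

```python
import json

additional_config = {
    "recompute_scheduler_enable": True,         # now also set on prefill nodes
    "enable_shared_expert_dp": True,
    "multistream_overlap_shared_expert": True,  # carried over from the old payload
    "finegrained_tp_config": {"lmhead_tensor_parallel_size": 8},
}
print(f"--additional-config '{json.dumps(additional_config)}'")
```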
3 changes: 2 additions & 1 deletion tests/e2e/nightly/multi_node/config/DeepSeek-V3.yaml
@@ -16,7 +16,8 @@ npu_per_node: 16
env_common:
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 100
+ OMP_NUM_THREADS: 1
+ TASK_QUEUE_ENABLE: 1
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
NUMEXPR_MAX_THREADS: 128
4 changes: 2 additions & 2 deletions tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B-A2.yaml
@@ -27,7 +27,7 @@ deployment:
--enable-expert-parallel
--max-num-seqs 128
--max-model-len 40960
- --max-num-batched-tokens 256
+ --max-num-batched-tokens 2048
--trust-remote-code
--gpu-memory-utilization 0.9
--async-scheduling
@@ -44,7 +44,7 @@ deployment:
--seed 1024
--max-num-seqs 128
--max-model-len 40960
- --max-num-batched-tokens 256
+ --max-num-batched-tokens 2048
--enable-expert-parallel
--trust-remote-code
--gpu-memory-utilization 0.9
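Raising `--max-num-batched-tokens` from 256 to 2048 mostly affects prefill throughput: assuming the scheduler chunks prefills to this per-step token budget, a prompt filling the 40960-token window needs far fewer scheduler steps:

```python
import math

max_model_len = 40960
for budget in (256, 2048):
    # Steps needed to prefill a prompt that fills the whole context window.
    print(budget, "->", math.ceil(max_model_len / budget), "prefill steps")
# 256 -> 160 prefill steps; 2048 -> 20 prefill steps
```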
5 changes: 4 additions & 1 deletion tests/e2e/nightly/multi_node/config/Qwen3-235B-A22B.yaml
@@ -3,9 +3,12 @@ model: "Qwen/Qwen3-235B-A22B"
num_nodes: 2
npu_per_node: 16
env_common:
+ HCCL_OP_EXPANSION_MODE: AIV
+ TASK_QUEUE_ENABLE: 1
VLLM_USE_MODELSCOPE: true
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 100
+ OMP_NUM_THREADS: 1
+ PYTORCH_NPU_ALLOC_CONF: expandable_segments:True
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
NUMEXPR_MAX_THREADS: 128
@@ -3,9 +3,11 @@ model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
+ HCCL_OP_EXPANSION_MODE: AIV
VLLM_USE_MODELSCOPE: true
+ TASK_QUEUE_ENABLE: 1
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 100
+ OMP_NUM_THREADS: 1
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
DYNAMIC_EPLB: true
@@ -3,9 +3,11 @@ model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
+ HCCL_OP_EXPANSION_MODE: AIV
VLLM_USE_MODELSCOPE: true
+ TASK_QUEUE_ENABLE: 1
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 100
+ OMP_NUM_THREADS: 1
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
NUMEXPR_MAX_THREADS: 128
4 changes: 3 additions & 1 deletion tests/e2e/nightly/multi_node/config/Qwen3-235B-W8A8.yaml
@@ -3,9 +3,11 @@ model: "vllm-ascend/Qwen3-235B-A22B-W8A8"
num_nodes: 2
npu_per_node: 16
env_common:
+ HCCL_OP_EXPANSION_MODE: AIV
VLLM_USE_MODELSCOPE: true
+ TASK_QUEUE_ENABLE: 1
OMP_PROC_BIND: false
- OMP_NUM_THREADS: 100
+ OMP_NUM_THREADS: 1
HCCL_BUFFSIZE: 1024
SERVER_PORT: 8080
NUMEXPR_MAX_THREADS: 128
3 changes: 3 additions & 0 deletions vllm_ascend/attention/context_parallel/mla_cp.py
@@ -69,6 +69,9 @@ def __init__(
            self.decode_threshold,
            dtype=torch.uint8,
            device=device)
+       self.block_size = (self.block_size *
+                          self.cp_virtual_block_size) // np.gcd(
+                              self.block_size, self.cp_virtual_block_size)

    def build(
        self,
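The added assignment is the LCM-via-GCD identity, `lcm(a, b) == a * b // gcd(a, b)`: the effective block size becomes the smallest granularity that tiles evenly under both the KV-cache block size and the context-parallel virtual block size. A standalone sketch of the same computation (`math.gcd` standing in for `np.gcd`):

```python
from math import gcd

def aligned_block_size(block_size: int, cp_virtual_block_size: int) -> int:
    # Least common multiple: divisible by both sizes, so cache blocks and
    # context-parallel virtual blocks stay aligned.
    return (block_size * cp_virtual_block_size) // gcd(
        block_size, cp_virtual_block_size)

assert aligned_block_size(128, 64) == 128  # one already divides the other
assert aligned_block_size(128, 48) == 384  # otherwise strictly larger
```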