Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/nightly_test_a3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ jobs:
- name: multi-node-kimi-k2-instruct-w8a8
config_file_path: Kimi-K2-Instruct-W8A8.yaml
size: 2
- name: multi-node-deepseek-v3.1
config_file_path: DeepSeek-V3.1-BF16.yaml
size: 2
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
with:
soc_version: a3
Expand Down
82 changes: 82 additions & 0 deletions tests/e2e/nightly/multi_node/config/DeepSeek-V3.1-BF16.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Identification and cluster shape for this multi-node nightly case.
test_name: 'test DeepSeek-V3.1-BF16 on A3'
# Same model identifier that both `vllm serve` commands under `deployment` use.
model: 'unsloth/DeepSeek-V3.1-BF16'
# Matches the two entries listed under `deployment`.
num_nodes: 2
# Equals data-parallel-size-local (2) x tensor-parallel-size (8) as passed in
# each server_cmd.
npu_per_node: 16
# Environment variables for this test case.
# NOTE(review): the key name suggests these are applied on every node -
# confirm against the harness that consumes this file.
# NOTE(review): unquoted true/false load as YAML booleans and the bare numbers
# as integers; confirm the harness converts them to the strings each consumer
# expects before exporting them.
env_common:
  VLLM_USE_MODELSCOPE: true
  HCCL_BUFFSIZE: 2048
  # Expanded as $SERVER_PORT by the first server_cmd under `deployment`.
  SERVER_PORT: 8080
  OMP_PROC_BIND: false
  VLLM_ASCEND_ENABLE_FLASHCOMM1: 1
  # Quoted so the embedded colon can never be read as a mapping separator;
  # the resulting string is unchanged.
  PYTORCH_NPU_ALLOC_CONF: 'expandable_segments:True'
  OMP_NUM_THREADS: 1
  VLLM_ASCEND_ENABLE_MLAPO: 1
  VLLM_ASCEND_BALANCE_SCHEDULING: 1
  HCCL_INTRA_PCIE_ENABLE: 1
  HCCL_INTRA_ROCE_ENABLE: 0

# Two server commands (num_nodes is 2). Both entries pass identical
# parallelism and engine flags; they differ only in the flags called out on
# each entry below.
# NOTE: `>` folds each command into one space-separated line, so every flag
# must stay at the same indentation as `vllm serve`. Nothing inside the scalar
# is treated as a YAML comment, so keep notes outside of it.
deployment:
  # Entry 1: binds --host 0.0.0.0 on --port $SERVER_PORT and passes $LOCAL_IP
  # as --data-parallel-address. No --data-parallel-start-rank is given.
  -
    server_cmd: >
      vllm serve unsloth/DeepSeek-V3.1-BF16
      --host 0.0.0.0
      --port $SERVER_PORT
      --data-parallel-size 4
      --tensor-parallel-size 8
      --data-parallel-size-local 2
      --data-parallel-address $LOCAL_IP
      --data-parallel-rpc-port 13399
      --no-enable-prefix-caching
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --enable-expert-parallel
      --trust-remote-code
      --gpu-memory-utilization 0.95
      --async-scheduling
      --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[2, 4, 8, 16, 32]}'
      --additional_config '{"enable_multistream_moe": true}'

  # Entry 2: runs with --headless (no --host/--port), starts at
  # --data-parallel-start-rank 2 and passes $MASTER_IP as
  # --data-parallel-address, using the same rpc port (13399) as entry 1.
  -
    server_cmd: >
      vllm serve unsloth/DeepSeek-V3.1-BF16
      --headless
      --data-parallel-size 4
      --tensor-parallel-size 8
      --data-parallel-size-local 2
      --data-parallel-start-rank 2
      --data-parallel-address $MASTER_IP
      --data-parallel-rpc-port 13399
      --no-enable-prefix-caching
      --max-num-seqs 16
      --max-model-len 8192
      --max-num-batched-tokens 4096
      --enable-expert-parallel
      --trust-remote-code
      --gpu-memory-utilization 0.95
      --async-scheduling
      --speculative-config '{"num_speculative_tokens": 1, "method":"mtp"}'
      --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[2, 4, 8, 16, 32]}'
      --additional_config '{"enable_multistream_moe": true}'
# Benchmark cases for this deployment: one with case_type `performance` and
# one with case_type `accuracy`. Each case is nested under its own key so the
# shared field names (dataset_path, baseline, threshold, ...) do not collide.
benchmarks:
  perf:
    case_type: performance
    dataset_path: vllm-ascend/GSM8K-in3500-bs2800
    request_conf: vllm_api_stream_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf
    num_prompts: 512
    max_out_len: 512
    batch_size: 700
    request_rate: 11.2
    # NOTE(review): presumably a reference value and the minimum accepted
    # ratio against it - confirm how the harness interprets these two fields.
    baseline: 1
    threshold: 0.97
  acc:
    case_type: accuracy
    dataset_path: vllm-ascend/gsm8k-lite
    request_conf: vllm_api_general_chat
    dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt
    max_out_len: 4096
    batch_size: 512
    # NOTE(review): presumably an expected score and the allowed deviation
    # from it - confirm units and direction against the harness.
    baseline: 95
    threshold: 5
Loading