diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index 40cb9d64b90..03da7c9ebb2 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -28,7 +28,10 @@ on: type: string default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.1-910b-ubuntu22.04-py3.11" tests: - required: true + required: false + type: string + config_file_path: + required: false type: string name: required: false @@ -44,12 +47,12 @@ defaults: # only cancel in-progress runs of the same workflow # and ignore the lint / 1 card / 4 cards test type concurrency: - group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.tests }} + group: ascend-nightly-${{ github.workflow_ref }}-${{ github.ref }}-${{ inputs.config_file_path || inputs.tests }} cancel-in-progress: true jobs: e2e-nightly: - name: ${{ inputs.tests }} + name: ${{ inputs.name || inputs.config_file_path || inputs.tests }} runs-on: ${{ inputs.runner }} timeout-minutes: 600 container: @@ -114,14 +117,33 @@ jobs: update-alternatives --install /usr/bin/clang clang /usr/bin/clang-15 20 update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-15 20 - - name: Run vllm-project/vllm-ascend test + - name: Validate Inputs + run: | + if [[ -z "${{ inputs.tests }}" && -z "${{ inputs.config_file_path }}" ]]; then + echo "Error: Either 'tests' or 'config_file_path' must be provided." 
+ exit 1 + fi + + - name: Run Pytest (py-driven) + if: ${{ inputs.tests != '' }} env: VLLM_WORKER_MULTIPROC_METHOD: spawn VLLM_USE_MODELSCOPE: True VLLM_CI_RUNNER: ${{ inputs.runner }} - BENCHMARK_HOME: /vllm-workspace/vllm-ascend/benchmark working-directory: /vllm-workspace/vllm-ascend run: | - # ignore test_dispatch_ffn_combine until the test is fixed - pytest -sv ${{ inputs.tests }} \ + echo "Running pytest with tests path: ${{ inputs.tests }}" + pytest -sv "${{ inputs.tests }}" \ --ignore=tests/e2e/nightly/single_node/ops/singlecard_ops/test_fused_moe.py + + - name: Run Pytest (YAML-driven) + if: ${{ always() && inputs.config_file_path != '' }} + env: + VLLM_WORKER_MULTIPROC_METHOD: spawn + VLLM_USE_MODELSCOPE: True + VLLM_CI_RUNNER: ${{ inputs.runner }} + CONFIG_YAML_PATH: ${{ inputs.config_file_path }} + working-directory: /vllm-workspace/vllm-ascend + run: | + echo "Running YAML-driven test with config: ${{ inputs.config_file_path }}" + pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py diff --git a/.github/workflows/schedule_nightly_test_a2.yaml b/.github/workflows/schedule_nightly_test_a2.yaml index abd2c23b8c1..7eed141489f 100644 --- a/.github/workflows/schedule_nightly_test_a2.yaml +++ b/.github/workflows/schedule_nightly_test_a2.yaml @@ -49,15 +49,6 @@ jobs: fail-fast: false matrix: test_config: - - name: qwen3-next - os: linux-aarch64-a2b3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen3_next.py - - name: qwen3-32b - os: linux-aarch64-a2b3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen3_32b.py - - name: qwen3-32b-in8-a2 - os: linux-aarch64-a2b3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py - name: test_custom_op os: linux-aarch64-a2b3-1 tests: tests/e2e/nightly/single_node/ops/singlecard_ops @@ -71,10 +62,33 @@ jobs: name: ${{ matrix.test_config.name }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' + single-node-yaml-tests: + name: 
single-node + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + strategy: + fail-fast: false + matrix: + test_config: + - name: qwen3-32b + os: linux-aarch64-a2b3-4 + config_file_path: Qwen3-32B.yaml + - name: qwen3-next-80b-a3b-instruct + os: linux-aarch64-a2b3-4 + config_file_path: Qwen3-Next-80B-A3B-Instruct-A2.yaml + - name: qwen3-32b-int8 + os: linux-aarch64-a2b3-4 + config_file_path: Qwen3-32B-Int8-A2.yaml + uses: ./.github/workflows/_e2e_nightly_single_node.yaml + with: + runner: ${{ matrix.test_config.os }} + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a2' + config_file_path: ${{ matrix.test_config.config_file_path }} + name: ${{ matrix.test_config.name }} + multi-node-tests: name: multi-node if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: single-node-tests + needs: [single-node-tests, single-node-yaml-tests] strategy: fail-fast: false max-parallel: 1 diff --git a/.github/workflows/schedule_nightly_test_a3.yaml b/.github/workflows/schedule_nightly_test_a3.yaml index 940f851817c..f42bd730df7 100644 --- a/.github/workflows/schedule_nightly_test_a3.yaml +++ b/.github/workflows/schedule_nightly_test_a3.yaml @@ -109,73 +109,83 @@ jobs: single-node-tests: name: single-node if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') - needs: multi-node-tests + needs: [multi-node-tests] strategy: fail-fast: false matrix: test_config: - - name: qwen3-32b-in8-a3 - os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py - - name: qwen3-32b-int8-a3-feature-stack3 + - name: qwen3-30b-acc os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen3_32b_int8_a3_feature_stack3.py - - name: qwen3-235b-a22b-w8a8-eplb + tests: tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py + uses: ./.github/workflows/_e2e_nightly_single_node.yaml + with: 
+ runner: ${{ matrix.test_config.os }} + image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' + tests: ${{ matrix.test_config.tests }} + name: ${{ matrix.test_config.name }} + + single-node-yaml-tests: + name: single-node + if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') + needs: [multi-node-tests] + strategy: + fail-fast: false + matrix: + test_config: + # YAML-driven tests + - name: deepseek-r1-0528-w8a8 + os: linux-aarch64-a3-16 + config_file_path: DeepSeek-R1-0528-W8A8.yaml + - name: deepseek-r1-w8a8-hbm os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py - - name: deepseek-r1-w8a8-eplb + config_file_path: DeepSeek-R1-W8A8-HBM.yaml + - name: deepseek-v3-2-w8a8 + os: linux-aarch64-a3-16 + config_file_path: DeepSeek-V3.2-W8A8.yaml + - name: kimi-k2-thinking os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py - - name: deepseek-r1-w8a8-mtpx + config_file_path: Kimi-K2-Thinking.yaml + - name: mtpx-deepseek-r1-0528-w8a8 os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_mtpx_deepseek_r1_0528_w8a8.py + config_file_path: MTPX-DeepSeek-R1-0528-W8A8.yaml + - name: qwen3-235b-a22b-w8a8 + os: linux-aarch64-a3-16 + config_file_path: Qwen3-235B-A22B-W8A8.yaml + - name: qwen3-30b-a3b-w8a8 + os: linux-aarch64-a3-4 + config_file_path: Qwen3-30B-A3B-W8A8.yaml + - name: qwen3-next-80b-a3b-instruct-w8a8 + os: linux-aarch64-a3-4 + config_file_path: Qwen3-Next-80B-A3B-Instruct-W8A8.yaml + - name: qwq-32b + os: linux-aarch64-a3-4 + config_file_path: QwQ-32B.yaml + - name: qwen3-32b-int8 + os: linux-aarch64-a3-4 + config_file_path: Qwen3-32B-Int8.yaml - name: qwen2-5-vl-7b os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b.py + config_file_path: Qwen2.5-VL-7B-Instruct.yaml - name: qwen2-5-vl-7b-epd os: linux-aarch64-a3-4 - tests: 
tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py + config_file_path: Qwen2.5-VL-7B-Instruct-EPD.yaml - name: qwen2-5-vl-32b os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen2_5_vl_32b.py + config_file_path: Qwen2.5-VL-32B-Instruct.yaml + - name: qwen3-32b-int8-a3-feature-stack3 + os: linux-aarch64-a3-4 + config_file_path: Qwen3-32B-Int8-A3-Feature-Stack3.yaml - name: qwen3-32b-int8-prefix-cache os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_prefix_cache_qwen3_32b_int8.py - - name: deepseek-r1-0528-w8a8 - os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8.py + config_file_path: Prefix-Cache-Qwen3-32B-Int8.yaml - name: deepseek-r1-0528-w8a8-prefix-cache os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_prefix_cache_deepseek_r1_0528_w8a8.py - - name: qwq-32b-a3 - os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_qwq_32b.py - - name: qwen3-30b-w8a8 - os: linux-aarch64-a3-2 - tests: tests/e2e/nightly/single_node/models/test_qwen3_30b_w8a8.py - - name: qwen3-235b-w8a8 - os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py - - name: qwen3-next-w8a8 - os: linux-aarch64-a3-4 - tests: tests/e2e/nightly/single_node/models/test_qwen3_next_w8a8.py - - name: kimi-k2-thinking - os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py - - name: deepseek-r1-w8a8-hbm - os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py - - name: deepseek3_2-w8a8 - os: linux-aarch64-a3-16 - tests: tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py - - name: qwen3-30b-acc - os: linux-aarch64-a3-4 - tests: tests/e2e/weekly/single_node/models/test_qwen3_30b_acc.py + config_file_path: Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml uses: ./.github/workflows/_e2e_nightly_single_node.yaml with: runner: ${{ 
matrix.test_config.os }} image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:nightly-a3' - tests: ${{ matrix.test_config.tests }} + config_file_path: ${{ matrix.test_config.config_file_path }} name: ${{ matrix.test_config.name }} custom-ops-tests: diff --git a/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-0528-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-0528-W8A8.yaml new file mode 100644 index 00000000000..c57f58230f1 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-0528-W8A8.yaml @@ -0,0 +1,94 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + OMP_NUM_THREADS: "10" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--quantization" + - "ascend" + - "--data-parallel-size" + - "2" + - "--tensor-parallel-size" + - "8" + - "--enable-expert-parallel" + - "--port" + - "$SERVER_PORT" + - "--seed" + - "1024" + - "--max-model-len" + - "36864" + - "--max-num-batched-tokens" + - "4096" + - "--max-num-seqs" + - "16" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + - "--speculative-config" + - '{"num_speculative_tokens": 1, "method": "mtp"}' + - "--additional-config" + - '{"enable_weight_nz_layout": true}' + +_benchmarks_acc: &benchmarks_acc + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 95 + threshold: 5 + +_benchmarks_perf: &benchmarks_perf + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 400 + max_out_len: 1500 + batch_size: 1000 + baseline: 1 + threshold: 0.97 + +# 
========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "DeepSeek-R1-0528-W8A8-single" + model: "vllm-ascend/DeepSeek-R1-0528-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--enforce-eager" + benchmarks: + + - name: "DeepSeek-R1-0528-W8A8-aclgraph" + model: "vllm-ascend/DeepSeek-R1-0528-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + benchmarks: + <<: *benchmarks_acc + <<: *benchmarks_perf + + - name: "DeepSeek-R1-0528-W8A8-EPLB" + model: "vllm-ascend/DeepSeek-R1-0528-W8A8" + envs: + <<: *envs + DYNAMIC_EPLB: "true" + server_cmd: *server_cmd + server_cmd_extra: + - "--additional-config" + - '{"enable_weight_nz_layout": true, "eplb_config": {"dynamic_eplb": "true", "expert_heat_collection_interval": 1000, "algorithm_execution_interval": 50, "eplb_policy_type": 3}}' + benchmarks: + <<: *benchmarks_acc diff --git a/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml new file mode 100644 index 00000000000..d1f7f93e299 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/DeepSeek-R1-W8A8-HBM.yaml @@ -0,0 +1,42 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "DeepSeek-R1-W8A8-HBM-single" + model: "vllm-ascend/DeepSeek-R1-W8A8" + envs: + HCCL_BUFFSIZE: "1024" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--quantization" + - "ascend" + - "--port" + - "$SERVER_PORT" + - "--data-parallel-size" + - "8" + - "--data-parallel-size-local" + - "8" + - "--data-parallel-rpc-port" + - "13389" + - "--tensor-parallel-size" + - "2" + - "--enable-expert-parallel" + - "--seed" + - "1024" + - "--max-num-seqs" + - "32" + - "--max-model-len" + - "6000" + - "--max-num-batched-tokens" + - "6000" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.92" + - 
"--no-enable-prefix-caching" + - "--reasoning-parser" + - "deepseek_r1" + - "--enforce-eager" + - "--additional-config" + - '{"ascend_scheduler_config": {"enabled": false}, "torchair_graph_config": {"enabled": false, "enable_multistream_shared_expert": false}}' + benchmarks: diff --git a/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml new file mode 100644 index 00000000000..b237407904f --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/DeepSeek-V3.2-W8A8.yaml @@ -0,0 +1,78 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "DeepSeek-V3.2-W8A8-TP8-DP2" + model: "vllm-ascend/DeepSeek-V3.2-W8A8" + envs: + HCCL_OP_EXPANSION_MODE: "AIV" + OMP_PROC_BIND: "false" + OMP_NUM_THREADS: "1" + HCCL_BUFFSIZE: "1024" + VLLM_ASCEND_ENABLE_MLAPO: "1" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + VLLM_ASCEND_ENABLE_FLASHCOMM1: "1" + VLLM_ENGINE_READY_TIMEOUT_S: "1800" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--enable-expert-parallel" + - "--tensor-parallel-size" + - "8" + - "--data-parallel-size" + - "2" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "8192" + - "--max-num-batched-tokens" + - "8192" + - "--max-num-seqs" + - "4" + - "--trust-remote-code" + - "--quantization" + - "ascend" + - "--gpu-memory-utilization" + - "0.98" + - "--compilation-config" + - '{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}' + - "--speculative-config" + - '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}' + - "--additional-config" + - '{"layer_sharding": ["q_b_proj", "o_proj"]}' + - "--reasoning-parser" + - "deepseek_v3" + - "--tokenizer_mode" + - "deepseek_v32" + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + 
max_out_len: 4096 + batch_size: 8 + baseline: 95 + threshold: 5 + perf_1: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 1 + max_out_len: 1500 + batch_size: 1 + request_rate: 11.2 + baseline: 134 + threshold: 0.97 + perf_2: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 100 + max_out_len: 1500 + batch_size: 4 + request_rate: 11.2 + baseline: 134 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/GLM-4.5.yaml b/tests/e2e/nightly/single_node/models/configs/GLM-4.5.yaml new file mode 100644 index 00000000000..33a5013eb73 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/GLM-4.5.yaml @@ -0,0 +1,72 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + HCCL_BUFFSIZE: "1024" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--no-enable-prefix-caching" + - "--enable-expert-parallel" + - "--tensor-parallel-size" + - "8" + - "--data-parallel-size" + - "2" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "8192" + - "--max-num-batched-tokens" + - "8192" + - "--block-size" + - "16" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + +_benchmarks: &benchmarks + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 4096 + batch_size: 8 + baseline: 95 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 16 + max_out_len: 1500 + batch_size: 8 + request_rate: 0 + baseline: 1 + threshold: 0.97 + +# 
========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "GLM-4.5-TP8-DP2-fullgraph" + model: "ZhipuAI/GLM-4.5" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--compilation-config" + - '{"cudagraph_capture_sizes": [1,2,4,8,16], "cudagraph_mode":"FULL_DECODE_ONLY"}' + benchmarks: + <<: *benchmarks + + - name: "GLM-4.5-TP8-DP2-eager" + model: "ZhipuAI/GLM-4.5" + envs: + <<: *envs + server_cmd: *server_cmd + benchmarks: + <<: *benchmarks diff --git a/tests/e2e/nightly/single_node/models/configs/Kimi-K2-Thinking.yaml b/tests/e2e/nightly/single_node/models/configs/Kimi-K2-Thinking.yaml new file mode 100644 index 00000000000..68e9c838aa3 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Kimi-K2-Thinking.yaml @@ -0,0 +1,52 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Kimi-K2-Thinking-TP16-Case" + model: "moonshotai/Kimi-K2-Thinking" + envs: + HCCL_BUFFSIZE: "1024" + TASK_QUEUE_ENABLE: "1" + OMP_PROC_BIND: "false" + HCCL_OP_EXPANSION_MODE: "AIV" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--tensor-parallel-size" + - "16" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "8192" + - "--max-num-batched-tokens" + - "8192" + - "--max-num-seqs" + - "12" + - "--gpu-memory-utilization" + - "0.9" + - "--trust-remote-code" + - "--enable-expert-parallel" + - "--no-enable-prefix-caching" + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 4096 + batch_size: 32 + baseline: 95 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 512 + 
max_out_len: 256 + batch_size: 64 + trust_remote_code: true + request_rate: 11.2 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/MTPX-DeepSeek-R1-0528-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/MTPX-DeepSeek-R1-0528-W8A8.yaml new file mode 100644 index 00000000000..30fec5ad298 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/MTPX-DeepSeek-R1-0528-W8A8.yaml @@ -0,0 +1,90 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + OMP_NUM_THREADS: "100" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + VLLM_RPC_TIMEOUT: "3600000" + VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS: "3600000" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--quantization" + - "ascend" + - "--seed" + - "1024" + - "--no-enable-prefix-caching" + - "--data-parallel-size" + - "2" + - "--tensor-parallel-size" + - "8" + - "--enable-expert-parallel" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "40960" + - "--max-num-seqs" + - "14" + - "--trust-remote-code" + +_benchmarks_gsm8k: &benchmarks_gsm8k + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 95 + threshold: 5 + +_benchmarks_aime: &benchmarks_aime + acc: + case_type: accuracy + dataset_path: vllm-ascend/aime2024 + request_conf: vllm_api_general_chat + dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 86.67 + threshold: 7 + +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "MTPX-DeepSeek-R1-0528-W8A8-mtp2" + model: "vllm-ascend/DeepSeek-R1-0528-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--max-num-batched-tokens" + - "4096" + - "--speculative-config" 
+ - '{"num_speculative_tokens": 2, "method": "mtp"}' + - "--gpu-memory-utilization" + - "0.92" + benchmarks: + <<: *benchmarks_gsm8k + + - name: "MTPX-DeepSeek-R1-0528-W8A8-mtp3" + model: "vllm-ascend/DeepSeek-R1-0528-W8A8" + envs: + <<: *envs + HCCL_OP_EXPANSION_MODE: "AIV" + server_cmd: *server_cmd + server_cmd_extra: + - "--max-num-batched-tokens" + - "2048" + - "--speculative-config" + - '{"num_speculative_tokens": 3, "method": "mtp"}' + - "--gpu-memory-utilization" + - "0.9" + - "--compilation-config" + - '{"cudagraph_capture_sizes": [56], "cudagraph_mode": "FULL_DECODE_ONLY"}' + benchmarks: + <<: *benchmarks_aime diff --git a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml new file mode 100644 index 00000000000..ddfbcab67f6 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-DeepSeek-R1-0528-W8A8.yaml @@ -0,0 +1,77 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "prefix-cache-deepseek-r1-0528-w8a8" + model: "vllm-ascend/DeepSeek-R1-0528-W8A8" + envs: + OMP_NUM_THREADS: "10" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--quantization" + - "ascend" + - "--data-parallel-size" + - "2" + - "--tensor-parallel-size" + - "8" + - "--enable-expert-parallel" + - "--port" + - "$SERVER_PORT" + - "--seed" + - "1024" + - "--max-model-len" + - "5200" + - "--max-num-batched-tokens" + - "4096" + - "--max-num-seqs" + - "16" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + - "--additional-config" + - '{"enable_weight_nz_layout": true}' + - "--speculative-config" + - '{"num_speculative_tokens": 1, "method": "mtp"}' + test_content: + - "benchmark_comparisons" + benchmark_comparisons_args: + - metric: "TTFT" + baseline: 
"prefix0" + target: "prefix75" + ratio: 0.8 + operator: "<" + benchmarks: + warm_up: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in1024-bs210 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 210 + max_out_len: 2 + batch_size: 1000 + baseline: 0 + threshold: 0.97 + prefix0: + case_type: performance + dataset_path: vllm-ascend/prefix0-in3500-bs210 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 210 + max_out_len: 1500 + batch_size: 18 + baseline: 1 + threshold: 0.97 + prefix75: + case_type: performance + dataset_path: vllm-ascend/prefix75-in3500-bs210 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 210 + max_out_len: 1500 + batch_size: 18 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml new file mode 100644 index 00000000000..6ead3525237 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Prefix-Cache-Qwen3-32B-Int8.yaml @@ -0,0 +1,70 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "prefix-cache-qwen3-32b-w8a8" + model: "vllm-ascend/Qwen3-32B-W8A8" + envs: + TASK_QUEUE_ENABLE: "1" + HCCL_OP_EXPANSION_MODE: "AIV" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--quantization" + - "ascend" + - "--reasoning-parser" + - "qwen3" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "8192" + - "--max-num-batched-tokens" + - "8192" + - "--max-num-seqs" + - "256" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + - "--additional-config" + - '{"enable_weight_nz_layout": true}' + test_content: + - "benchmark_comparisons" + benchmark_comparisons_args: + - metric: "TTFT" + baseline: 
"prefix0" + target: "prefix75" + ratio: 0.8 + operator: "<" + benchmarks: + warm_up: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in1024-bs210 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 210 + max_out_len: 2 + batch_size: 1000 + baseline: 0 + threshold: 0.97 + prefix0: + case_type: performance + dataset_path: vllm-ascend/prefix0-in3500-bs210 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 210 + max_out_len: 1500 + batch_size: 48 + baseline: 1 + threshold: 0.97 + prefix75: + case_type: performance + dataset_path: vllm-ascend/prefix75-in3500-bs210 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 210 + max_out_len: 1500 + batch_size: 48 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/QwQ-32B.yaml b/tests/e2e/nightly/single_node/models/configs/QwQ-32B.yaml new file mode 100644 index 00000000000..3b655dece09 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/QwQ-32B.yaml @@ -0,0 +1,78 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + TASK_QUEUE_ENABLE: "1" + OMP_PROC_BIND: "false" + HCCL_OP_EXPANSION_MODE: "AIV" + VLLM_ASCEND_ENABLE_FLASHCOMM: "1" + VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE: "1" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "36864" + - "--max-num-batched-tokens" + - "36864" + - "--block-size" + - "128" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + - "--reasoning-parser" + - "deepseek_r1" + - "--distributed_executor_backend" + - "mp" + - "--additional-config" + - '{"weight_prefetch_config":{"enabled":true}}' + +_benchmarks: &benchmarks + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + 
request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 95 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 240 + max_out_len: 1500 + batch_size: 60 + baseline: 1 + threshold: 0.97 + +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "QwQ-32B-aclgraph" + model: "Qwen/QwQ-32B" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--compilation_config" + - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}' + benchmarks: + <<: *benchmarks + + - name: "QwQ-32B-single" + model: "Qwen/QwQ-32B" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--enforce-eager" + benchmarks: diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-32B-Instruct.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-32B-Instruct.yaml new file mode 100644 index 00000000000..dc4ac507bc5 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-32B-Instruct.yaml @@ -0,0 +1,63 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen2.5-VL-32B-Instruct-a3" + model: "Qwen/Qwen2.5-VL-32B-Instruct" + envs: + TASK_QUEUE_ENABLE: "1" + VLLM_ASCEND_ENABLE_NZ: "0" + HCCL_OP_EXPANSION_MODE: "AIV" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--no-enable-prefix-caching" + - "--mm-processor-cache-gb" + - "0" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "30000" + - "--max-num-batched-tokens" + - "40000" + - "--max-num-seqs" + - "400" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.8" + - "--compilation_config" + - 
'{"cudagraph_mode": "FULL_DECODE_ONLY"}' + test_content: + - "completion" + - "image" + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/textvqa-lite + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + max_out_len: 2048 + batch_size: 128 + baseline: 76.22 + temperature: 0 + top_k: -1 + top_p: 1 + repetition_penalty: 1 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/textvqa-perf-1080p + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + num_prompts: 512 + max_out_len: 256 + batch_size: 128 + temperature: 0 + top_k: -1 + top_p: 1 + repetition_penalty: 1 + request_rate: 0 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-7B-Instruct-EPD.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-7B-Instruct-EPD.yaml new file mode 100644 index 00000000000..10cfe2366b4 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-7B-Instruct-EPD.yaml @@ -0,0 +1,92 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen2.5-VL-7B-Instruct-epd" + model: "Qwen/Qwen2.5-VL-7B-Instruct" + service_mode: "epd" + envs: + ENCODE_PORT: "DEFAULT_PORT" + PD_PORT: "DEFAULT_PORT" + PROXY_PORT: "DEFAULT_PORT" + epd_server_cmds: + - - "--port" + - "$ENCODE_PORT" + - "--model" + - "Qwen/Qwen2.5-VL-7B-Instruct" + - "--gpu-memory-utilization" + - "0.01" + - "--tensor-parallel-size" + - "1" + - "--enforce-eager" + - "--no-enable-prefix-caching" + - "--max-model-len" + - "10000" + - "--max-num-batched-tokens" + - "10000" + - "--max-num-seqs" + - "1" + - "--ec-transfer-config" + - '{"ec_connector_extra_config":{"shared_storage_path":"/dev/shm/epd/storage"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}' + - - "--port" + - "$PD_PORT" + - "--model" + - "Qwen/Qwen2.5-VL-7B-Instruct" + - "--gpu-memory-utilization" + - 
"0.95" + - "--tensor-parallel-size" + - "1" + - "--enforce-eager" + - "--max-model-len" + - "10000" + - "--max-num-batched-tokens" + - "10000" + - "--max-num-seqs" + - "128" + - "--ec-transfer-config" + - '{"ec_connector_extra_config":{"shared_storage_path":"/dev/shm/epd/storage"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}' + epd_proxy_args: + - "--host" + - "127.0.0.1" + - "--port" + - "$PROXY_PORT" + - "--encode-servers-urls" + - "http://localhost:$ENCODE_PORT" + - "--decode-servers-urls" + - "http://localhost:$PD_PORT" + - "--prefill-servers-urls" + - "disable" + test_content: + benchmarks: + warm_up: + case_type: performance + dataset_path: vllm-ascend/textvqa-perf-1080p + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + num_prompts: 50 + max_out_len: 20 + batch_size: 32 + request_rate: 0 + baseline: 1 + threshold: 0.97 + acc: + case_type: accuracy + dataset_path: vllm-ascend/textvqa-lite + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + max_out_len: 2048 + batch_size: 128 + baseline: 82.05 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/textvqa-perf-1080p + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + num_prompts: 512 + max_out_len: 256 + batch_size: 128 + request_rate: 0 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-7B-Instruct.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-7B-Instruct.yaml new file mode 100644 index 00000000000..eee1b4e788e --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen2.5-VL-7B-Instruct.yaml @@ -0,0 +1,55 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen2.5-VL-7B-Instruct" + model: "Qwen/Qwen2.5-VL-7B-Instruct" + envs: + TASK_QUEUE_ENABLE: "1" + VLLM_ASCEND_ENABLE_NZ: "0" + HCCL_OP_EXPANSION_MODE: "AIV" + 
SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--no-enable-prefix-caching" + - "--mm-processor-cache-gb" + - "0" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "30000" + - "--max-num-batched-tokens" + - "40000" + - "--max-num-seqs" + - "400" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.8" + - "--compilation_config" + - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + test_content: + - "completion" + - "image" + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/textvqa-lite + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + max_out_len: 2048 + batch_size: 128 + baseline: 82.05 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/textvqa-perf-1080p + request_conf: vllm_api_stream_chat + dataset_conf: textvqa/textvqa_gen_base64 + num_prompts: 512 + max_out_len: 256 + batch_size: 128 + request_rate: 0 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-235B-A22B-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-235B-A22B-W8A8.yaml new file mode 100644 index 00000000000..154ad3293d8 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-235B-A22B-W8A8.yaml @@ -0,0 +1,85 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + OMP_NUM_THREADS: "10" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + VLLM_ASCEND_ENABLE_FLASHCOMM1: "1" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--quantization" + - "ascend" + - "--async-scheduling" + - "--data-parallel-size" + - "4" + - "--tensor-parallel-size" + - "4" + - "--enable-expert-parallel" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "40960" + - "--max-num-batched-tokens" + - "8192" + - "--max-num-seqs" + - "12" + - "--trust-remote-code" + - 
"--gpu-memory-utilization" + - "0.9" + +_benchmarks: &benchmarks + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + top_k: 20 + baseline: 95 + threshold: 5 + +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-235B-A22B-W8A8-full_graph" + model: "vllm-ascend/Qwen3-235B-A22B-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--compilation-config" + - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + benchmarks: + <<: *benchmarks + + - name: "Qwen3-235B-A22B-W8A8-piecewise" + model: "vllm-ascend/Qwen3-235B-A22B-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--compilation-config" + - '{"cudagraph_mode": "PIECEWISE"}' + benchmarks: + <<: *benchmarks + + - name: "Qwen3-235B-A22B-W8A8-EPLB" + model: "vllm-ascend/Qwen3-235B-A22B-W8A8" + envs: + <<: *envs + DYNAMIC_EPLB: "true" + server_cmd: *server_cmd + server_cmd_extra: + - "--additional-config" + - '{"eplb_config": {"dynamic_eplb": "true", "expert_heat_collection_interval": 600, "algorithm_execution_interval": 50, "num_redundant_experts": 16, "eplb_policy_type": 2}}' + - "--compilation-config" + - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + benchmarks: + <<: *benchmarks diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-30B-A3B-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-30B-A3B-W8A8.yaml new file mode 100644 index 00000000000..1f65a05390a --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-30B-A3B-W8A8.yaml @@ -0,0 +1,46 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-30B-A3B-W8A8-TP1" + model: "vllm-ascend/Qwen3-30B-A3B-W8A8" + envs: + OMP_PROC_BIND: "false" + OMP_NUM_THREADS: "10" + 
HCCL_BUFFSIZE: "1024" + HCCL_OP_EXPANSION_MODE: "AIV" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--quantization" + - "ascend" + - "--async-scheduling" + - "--no-enable-prefix-caching" + - "--tensor-parallel-size" + - "1" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "5600" + - "--max-num-batched-tokens" + - "16384" + - "--max-num-seqs" + - "100" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + - "--compilation-config" + - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' + benchmarks: + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 180 + max_out_len: 1500 + batch_size: 45 + request_rate: 0 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A2.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A2.yaml new file mode 100644 index 00000000000..fa894d5e839 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A2.yaml @@ -0,0 +1,79 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + TASK_QUEUE_ENABLE: "1" + HCCL_OP_EXPANSION_MODE: "AIV" + VLLM_ASCEND_ENABLE_FLASHCOMM: "1" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--quantization" + - "ascend" + - "--no-enable-prefix-caching" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "40960" + - "--max-num-batched-tokens" + - "40960" + - "--block-size" + - "128" + - "--trust-remote-code" + - "--reasoning-parser" + - "qwen3" + - "--gpu-memory-utilization" + - "0.9" + - "--async-scheduling" + - "--additional-config" + - '{"weight_prefetch_config":{"enabled":true}}' + +_benchmarks: &benchmarks + acc: + case_type: accuracy + dataset_path: vllm-ascend/aime2024 + request_conf: 
vllm_api_general_chat + dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 83.33 + threshold: 7 + + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 288 + max_out_len: 1500 + batch_size: 72 + baseline: 1 + threshold: 0.97 + +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-32B-W8A8-aclgraph-a2" + model: "vllm-ascend/Qwen3-32B-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--compilation-config" + - '{"cudagraph_mode":"FULL_DECODE_ONLY","cudagraph_capture_sizes":[1,12,16,20,24,32,48,60,64,68,72,76,80]}' + benchmarks: + <<: *benchmarks + + - name: "Qwen3-32B-W8A8-single-a2" + model: "vllm-ascend/Qwen3-32B-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--enforce-eager" + benchmarks: diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml new file mode 100644 index 00000000000..0b396ae031d --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8-A3-Feature-Stack3.yaml @@ -0,0 +1,69 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-32B-W8A8-a3-feature-stack3" + model: "vllm-ascend/Qwen3-32B-W8A8" + envs: + VLLM_USE: "1" + TASK_QUEUE_ENABLE: "1" + HCCL_OP_EXPANSION_MODE: "AIV" + OMP_PROC_BIND: "false" + VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE: "1" + VLLM_ASCEND_ENABLE_FLASHCOMM: "1" + SERVER_PORT: "DEFAULT_PORT" + prompts: + - "9.11 and 9.8, which is greater?" 
+ api_keyword_args: + chat_template_kwargs: + enable_thinking: true + server_cmd: + - "--quantization" + - "ascend" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--trust-remote-code" + - "--reasoning-parser" + - "qwen3" + - "--distributed_executor_backend" + - "mp" + - "--gpu-memory-utilization" + - "0.9" + - "--block-size" + - "128" + - "--max-num-seqs" + - "256" + - "--enforce-eager" + - "--max-model-len" + - "35840" + - "--max-num-batched-tokens" + - "35840" + - "--additional-config" + - '{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}' + - "--compilation-config" + - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}' + test_content: + - "chat_completion" + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt + max_out_len: 10240 + batch_size: 32 + baseline: 96 + threshold: 4 + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 240 + max_out_len: 1500 + batch_size: 60 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml new file mode 100644 index 00000000000..0a3a90e552d --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B-Int8.yaml @@ -0,0 +1,78 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + TASK_QUEUE_ENABLE: "1" + HCCL_OP_EXPANSION_MODE: "AIV" + VLLM_ASCEND_ENABLE_FLASHCOMM: "1" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--quantization" + - "ascend" + - "--no-enable-prefix-caching" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + 
- "40960" + - "--max-num-batched-tokens" + - "40960" + - "--block-size" + - "128" + - "--trust-remote-code" + - "--reasoning-parser" + - "qwen3" + - "--gpu-memory-utilization" + - "0.9" + - "--async-scheduling" + - "--additional-config" + - '{"weight_prefetch_config":{"enabled":true}}' + +_benchmarks: &benchmarks + acc: + case_type: accuracy + dataset_path: vllm-ascend/aime2024 + request_conf: vllm_api_general_chat + dataset_conf: aime2024/aime2024_gen_0_shot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 83.33 + threshold: 7 + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 304 + max_out_len: 1500 + batch_size: 76 + baseline: 1 + threshold: 0.97 + +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-32B-W8A8-aclgraph-a3" + model: "vllm-ascend/Qwen3-32B-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--compilation-config" + - '{"cudagraph_mode":"FULL_DECODE_ONLY","cudagraph_capture_sizes":[1,12,16,20,24,32,48,60,64,68,72,76,80]}' + benchmarks: + <<: *benchmarks + + - name: "Qwen3-32B-W8A8-single-a3" + model: "vllm-ascend/Qwen3-32B-W8A8" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--enforce-eager" + benchmarks: diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-32B.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B.yaml new file mode 100644 index 00000000000..330d44b3d76 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-32B.yaml @@ -0,0 +1,51 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-32B-TP4" + model: "Qwen/Qwen3-32B" + envs: + TASK_QUEUE_ENABLE: "1" + OMP_PROC_BIND: "false" + HCCL_OP_EXPANSION_MODE: "AIV" + 
PAGED_ATTENTION_MASK_LEN: "5500" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--no-enable-prefix-caching" + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "36864" + - "--max-num-batched-tokens" + - "36864" + - "--block-size" + - "128" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.9" + - "--additional-config" + - '{"enable_weight_nz_layout":true}' + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 95 + threshold: 5 + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 80 + max_out_len: 1500 + batch_size: 20 + request_rate: 0 + baseline: 1 + threshold: 0.97 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct-A2.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct-A2.yaml new file mode 100644 index 00000000000..56a653fefdf --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct-A2.yaml @@ -0,0 +1,75 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + OMP_NUM_THREADS: "10" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "40960" + - "--trust-remote-code" + - "--async-scheduling" + - "--no-enable-prefix-caching" + - "--enable-expert-parallel" + - "--gpu-memory-utilization" + - "0.8" + - "--max-num-seqs" + - "64" + +_benchmarks: &benchmarks + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + 
request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 256 + max_out_len: 1500 + batch_size: 64 + baseline: 1 + threshold: 0.97 + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + top_k: 20 + baseline: 95 + threshold: 5 + +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-8192-a2" + model: "Qwen/Qwen3-Next-80B-A3B-Instruct" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--max-num-batched-tokens" + - "8192" + benchmarks: + <<: *benchmarks + + - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-32768-a2" + model: "Qwen/Qwen3-Next-80B-A3B-Instruct" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--max-num-batched-tokens" + - "32768" + benchmarks: + <<: *benchmarks diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct-W8A8.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct-W8A8.yaml new file mode 100644 index 00000000000..3deddc75192 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct-W8A8.yaml @@ -0,0 +1,45 @@ +# ========================================== +# ACTUAL TEST CASES +# ========================================== + +test_cases: + - name: "Qwen3-Next-80B-A3B-Instruct-W8A8" + model: "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8" + envs: + OMP_NUM_THREADS: "10" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + SERVER_PORT: "DEFAULT_PORT" + server_cmd: + - "--quantization" + - "ascend" + - "--async-scheduling" + - "--no-enable-prefix-caching" + - "--data-parallel-size" + - "1" + - "--tensor-parallel-size" + - "4" + - "--enable-expert-parallel" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "40960" 
+ - "--max-num-batched-tokens" + - "8192" + - "--max-num-seqs" + - "32" + - "--trust-remote-code" + - "--gpu-memory-utilization" + - "0.65" + - "--compilation-config" + - '{"cudagraph_capture_sizes": [32]}' + benchmarks: + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + baseline: 95 + threshold: 5 diff --git a/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct.yaml b/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct.yaml new file mode 100644 index 00000000000..31d4aaf4d41 --- /dev/null +++ b/tests/e2e/nightly/single_node/models/configs/Qwen3-Next-80B-A3B-Instruct.yaml @@ -0,0 +1,75 @@ +# ========================================== +# Shared Configurations +# ========================================== + +_envs: &envs + OMP_NUM_THREADS: "10" + OMP_PROC_BIND: "false" + HCCL_BUFFSIZE: "1024" + PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True" + SERVER_PORT: "DEFAULT_PORT" + +_server_cmd: &server_cmd + - "--tensor-parallel-size" + - "4" + - "--port" + - "$SERVER_PORT" + - "--max-model-len" + - "40960" + - "--trust-remote-code" + - "--async-scheduling" + - "--no-enable-prefix-caching" + - "--enable-expert-parallel" + - "--gpu-memory-utilization" + - "0.8" + - "--max-num-seqs" + - "64" + +_benchmarks: &benchmarks + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 256 + max_out_len: 1500 + batch_size: 64 + baseline: 1 + threshold: 0.97 + acc: + case_type: accuracy + dataset_path: vllm-ascend/gsm8k-lite + request_conf: vllm_api_general_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_chat_prompt + max_out_len: 32768 + batch_size: 32 + top_k: 20 + baseline: 95 + threshold: 5 + +# ========================================== +# ACTUAL TEST CASES +# 
========================================== + +test_cases: + - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-8192-a3" + model: "Qwen/Qwen3-Next-80B-A3B-Instruct" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--max-num-batched-tokens" + - "8192" + benchmarks: + <<: *benchmarks + + - name: "Qwen3-Next-80B-A3B-Instruct-aclgraph-32768-a3" + model: "Qwen/Qwen3-Next-80B-A3B-Instruct" + envs: + <<: *envs + server_cmd: *server_cmd + server_cmd_extra: + - "--max-num-batched-tokens" + - "32768" + benchmarks: + <<: *benchmarks diff --git a/tests/e2e/nightly/single_node/models/scripts/GUIDE_AND_TEMPLATE.md b/tests/e2e/nightly/single_node/models/scripts/GUIDE_AND_TEMPLATE.md new file mode 100644 index 00000000000..803cb800eec --- /dev/null +++ b/tests/e2e/nightly/single_node/models/scripts/GUIDE_AND_TEMPLATE.md @@ -0,0 +1,312 @@ +# vLLM-Ascend Single-Node E2E Test Developer Guide + +This document is intended to help developers understand the architecture of the single-node E2E (End-to-End) testing framework in `vllm-ascend`, how to run test scripts, and how to add custom testing functionality by writing YAML configuration files and extending the code. + +## 1. Test Architecture Overview + +To achieve high readability, extensibility, and decoupling of configuration from code, the single-node E2E test adopts a **"YAML-driven + Dispatcher"** architectural structure. + +It consists of the following core components: + +* **Configuration Parser (`single_node_config.py`)**: Responsible for reading `models/configs/*.yaml` files and parsing them into a strongly-typed `@dataclass` (`SingleNodeConfig`) via `SingleNodeConfigLoader`, while handling regex replacement for environment variables. +* **Service Manager Framework (`test_single_node.py` and `conftest.py`)**: Based on the `service_mode` (`openai` or `epd`), it utilizes context managers to safely start/stop server processes. 
+* **Test Function Dispatcher (`TEST_HANDLERS` Registry)**: Specific test logic is encapsulated into independent functions and registered in the global `TEST_HANDLERS` dictionary. +* **Performance Benchmarking (`_run_benchmarks`)**: Calls `aisbench` for performance and TTFT testing based on the `benchmarks` parameters in the YAML. + +### 1.1 Key Files and Responsibilities + +* `tests/e2e/nightly/single_node/models/scripts/single_node_config.py` + * Defines `SingleNodeConfig` and `SingleNodeConfigLoader` + * Loads YAML from `tests/e2e/nightly/single_node/models/configs/` + * Auto-assigns ports when `envs` contains `DEFAULT_PORT` / missing values + * Expands `$VAR` / `${VAR}` placeholders inside commands via `_expand_values` + +* `tests/e2e/nightly/single_node/models/scripts/test_single_node.py` + * Declares `configs = SingleNodeConfigLoader.from_yaml_cases()` (loaded at import time) + * `pytest.mark.parametrize("config", configs, ids=[config.name for config in configs])` runs one test per YAML case + * Controls server lifecycle via context managers + * Dispatches `test_content` to functions registered in `TEST_HANDLERS` + * Runs `aisbench` and optional benchmark assertions + +### 1.2 End-to-End Flow (High Level) + +```txt +pytest starts + | + v +import tests/e2e/nightly/single_node/models/scripts/test_single_node.py + | + v +configs = SingleNodeConfigLoader.from_yaml_cases() + | + v +pytest parametrize("config", configs) # one config == one test case + | + v +test_single_node(config) + | + +-----------------------------------------------+ + | Start service (depends on service_mode) | + | | + | openai: start one vLLM OpenAI-compatible | + | service process | + | epd: start (encode service + decode/PD | + | service) + start proxy process | + +-----------------------------------------------+ + | + v +Run test phases (test_content) + | + v +Optional benchmarks (if benchmarks is configured) + | + v +Shutdown all started processes + +Notes: +- One YAML file may contain 
multiple test_cases; pytest will run them one by one. +- The framework is "YAML-driven": changes are typically done by editing YAML rather than editing Python code. +``` + +### 1.3 Function Call Relationships (Dispatcher) + +`test_content` is a list of “phases”. Each phase maps to one handler function. + +```txt +For each test_case: + + test_content (list of phases) + | + v + [Dispatcher] + | + +--> phase "completion" -> send completion request(s) + | + +--> phase "chat_completion" -> send chat completion request(s) + | + +--> phase "image" -> send multimodal image request(s) + | + \--> (extendable) add your own phase by registering a new handler + +After phases: + if benchmarks is configured -> run aisbench + +Notes: +- The dispatcher only controls "what to run"; service lifecycle is controlled by the service manager. +- Phases are intentionally small & composable so you can reuse them across YAML cases. +``` + +## 2. Running and Debugging Steps + +### 2.1 Dependencies + +Ensure you are in an NPU environment and have installed `pytest`, `pyyaml`, `openai`, and `aisbench`. + +### 2.2 Local Execution + +The framework uses the `CONFIG_YAML_PATH` environment variable to specify the configuration file. + +```bash +# Switch to the project root directory +cd /vllm-workspace/vllm-ascend + +# Run a specific yaml test +export CONFIG_YAML_PATH="Qwen3-32B.yaml" +pytest -sv tests/e2e/nightly/single_node/models/scripts/test_single_node.py +``` + +### 2.3 Tips for Debugging + +* Only run a subset of cases: `pytest -sv ... -k ` (matches case names in the report output) +* Stop on first failure: `pytest -sv ... -x` +* Keep server logs visible: use `-s` (already included in `-sv`) and increase log verbosity via standard Python logging configuration if needed. + +## 3. 
How to Write YAML Configuration Files + +### 3.1 File Location and Selection Rules + +* YAML files live under: `tests/e2e/nightly/single_node/models/configs/` +* Selected by env var: `CONFIG_YAML_PATH=.yaml` +* If not set, the loader uses `SingleNodeConfigLoader.DEFAULT_CONFIG_NAME` + +### 3.2 Field Descriptions + +| Field Name | Type | Required | Default Value | Description | +| :--------------- | :--------- | :------- | :-------------- | :------------------------------------------------------------------ | +| `test_cases` | list | **Yes** | - | List of test case objects | +| `name` | string | **Yes** | - | Human-readable case ID shown in pytest output and logs | +| `model` | string | **Yes** | - | Model name or local path | +| `service_mode` | string | No | `openai` | Service mode: `openai` or `epd` (disaggregated) | +| `envs` | map | **Yes** | `{}` | Environment variables for the server process | +| `server_cmd` | list | Cond. | `[]` | vLLM startup arguments (Required for non-EPD) | +| `server_cmd_extra` | list | No | `[]` | Extra vLLM startup arguments appended after `server_cmd` | +| `prompts` | list | No | built-in default | Prompts for completion/chat tests | +| `api_keyword_args` | map | No | built-in default | OpenAI API keyword args (e.g., `max_tokens`, sampling params) | +| `test_content` | list | No | `["completion"]` | Test phases: `completion`, `chat_completion`, `image` etc. | +| `benchmarks` | map | No | `{}` | Configuration for `aisbench` performance verification | +| `epd_server_cmds`| list[list] | Cond. | `[]` | (EPD Only) Command arrays for starting dual Encode/Decode processes | +| `epd_proxy_args` | list | Cond. | `[]` | (EPD Only) Startup arguments for the EPD routing gateway | + +**Notes / Behaviors** + +* `name` is mandatory and must be a non-empty string. + * It is used directly as pytest case id (e.g., `test_single_node[DeepSeek-R1-0528-W8A8-single]`). + * It is also printed in `[single-node][START]` marker for log navigation. 
+ +* `envs` (ports): the config object recognizes these keys: `SERVER_PORT`, `ENCODE_PORT`, `PD_PORT`, `PROXY_PORT`. + * If a port key is missing or set to `DEFAULT_PORT`, it will be automatically filled with an available open port. + * `$SERVER_PORT` / `${SERVER_PORT}` placeholders in commands will be expanded using `envs`. + +* `server_cmd` vs `server_cmd_extra`: + * YAML can define `server_cmd_extra` to append additional args after `server_cmd`. + * The loader merges them into a single `server_cmd` list. + +* Extra fields: + * Any non-standard fields in a case are stored in `config.extra_config`. + * This is how extension configs are passed through without changing the dataclass. + +### 3.3 YAML Examples + +#### Single-Case (similar to DeepSeek-R1-W8A8-HBM) + +```yaml +test_cases: + - name: "" + model: "" + + # Optional: The default values are as follows + prompts: + - "San Francisco is a" + api_keyword_args: + max_tokens: 10 + + envs: + SERVER_PORT: "DEFAULT_PORT" + # Add only what you need. + + server_cmd: + - "--port" + - "$SERVER_PORT" + # plus your vLLM serve args... + + # Optional: omit -> defaults to ["completion"] + test_content: + - "chat_completion" + + # Optional: leave empty if you don't run aisbench + benchmarks: +``` + +#### Multi-Case + Shared Anchors + +```yaml +_envs: &envs + SERVER_PORT: "DEFAULT_PORT" + # shared envs... + +_server_cmd: &server_cmd + - "--port" + - "$SERVER_PORT" + # shared vLLM serve args... + +_benchmarks: &benchmarks + perf: + case_type: performance + dataset_path: vllm-ascend/GSM8K-in3500-bs400 + request_conf: vllm_api_stream_chat + dataset_conf: gsm8k/gsm8k_gen_0_shot_cot_str_perf + num_prompts: 400 + max_out_len: 1500 + batch_size: 1000 + baseline: 1 + threshold: 0.97 + +test_cases: + - name: "case-a" + model: "" + envs: + <<: *envs + DYNAMIC_EPLB: "true" + # private envs... 
+ server_cmd: *server_cmd + server_cmd_extra: + - "--enforce-eager" + benchmarks: + + - name: "case-b" + model: "" + envs: + <<: *envs + server_cmd: *server_cmd + benchmarks: + <<: *benchmarks_acc +``` + +#### EPD / Disaggregated Case + +```yaml +test_cases: + - name: "" + model: "" + service_mode: "epd" + envs: + ENCODE_PORT: "DEFAULT_PORT" + PD_PORT: "DEFAULT_PORT" + PROXY_PORT: "DEFAULT_PORT" + + epd_server_cmds: + - ["--port", "$ENCODE_PORT", "--model", ""] + - ["--port", "$PD_PORT", "--model", ""] + + epd_proxy_args: + - "--host" + - "127.0.0.1" + - "--port" + - "$PROXY_PORT" + - "--encode-servers-urls" + - "http://localhost:$ENCODE_PORT" + - "--decode-servers-urls" + - "http://localhost:$PD_PORT" + - "--prefill-servers-urls" + - "disable" + + test_content: + - "chat_completion" +``` + +## 4. How to Add Custom Tests (Extension) + +### Step 1: Write your test logic in `test_single_node.py` + +```python +async def run_video_test(config: SingleNodeConfig, server: 'RemoteOpenAIServer | DisaggEpdProxy') -> None: + client = server.get_async_client() + # Your custom logic here... +``` + +### Step 2: Register your function in `TEST_HANDLERS` + +```python +TEST_HANDLERS = { + "completion": run_completion_test, + "video": run_video_test, # Registered! +} +``` + +### Step 3: Enable in YAML + +```yaml + test_content: + - "completion" + - "video" +``` + +## 5. 
Checklist (Before Submitting a New YAML) + +* `test_cases` exists and is a list +* Each case contains required fields for its `service_mode` + * Common required: `name`, `model`, `envs` + * `openai`: `server_cmd` + * `epd`: `epd_server_cmds`, `epd_proxy_args` +* Port envs are set to `DEFAULT_PORT` (or to explicit free ports) +* If using `benchmarks`, ensure each benchmark case includes required aisbench fields (e.g., `case_type`, `dataset_path`, `request_conf`, `dataset_conf`, `max_out_len`, `batch_size`) diff --git a/tests/e2e/nightly/single_node/models/scripts/__init__.py b/tests/e2e/nightly/single_node/models/scripts/__init__.py new file mode 100644 index 00000000000..146a786d94d --- /dev/null +++ b/tests/e2e/nightly/single_node/models/scripts/__init__.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This file is a part of the vllm-ascend project. 
"""Configuration loading for YAML-driven single-node nightly e2e tests.

Parses a YAML file containing a ``test_cases`` list into ``SingleNodeConfig``
objects that describe how to launch a vLLM server (plain "openai" mode or
disaggregated "epd" mode) and which checks/benchmarks to run against it.
"""
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Any

import yaml
from vllm.utils.network_utils import get_open_port

# Directory (relative to the CI working dir) that holds the config YAMLs;
# yaml_path arguments are resolved relative to this base.
CONFIG_BASE_PATH = "tests/e2e/nightly/single_node/models/configs"

logger = logging.getLogger(__name__)

# Default prompts and API args fallback
PROMPTS = [
    "San Francisco is a",
]

API_KEYWORD_ARGS = {
    "max_tokens": 10,
}


@dataclass
class SingleNodeConfig:
    """One YAML-declared test case.

    Attributes:
        name: Unique case name (used as the pytest parametrize id).
        model: Model identifier passed to the server and to API clients.
        envs: Environment variables for the server process. Port keys set to
            the sentinel "DEFAULT_PORT" (or absent/None) are replaced with
            free ports in ``__post_init__``.
        prompts: Prompts sent by the completion check; defaults to PROMPTS.
        api_keyword_args: Extra keyword args for the OpenAI API call.
        benchmarks: Mapping of benchmark case name -> aisbench case dict.
        server_cmd: vllm serve args for "openai" mode, after $VAR expansion.
        test_content: Names of test handlers to run (e.g. "completion").
        service_mode: "openai" (default) or "epd".
        epd_server_cmds: Per-instance serve arg lists for "epd" mode.
        epd_proxy_args: Args for the EPD disaggregation proxy.
        extra_config: Any non-standard YAML fields; each is also set as an
            attribute on the instance (see ``__post_init__``).
    """

    name: str
    model: str
    envs: dict[str, Any] = field(default_factory=dict)
    prompts: list[str] = field(default_factory=lambda: PROMPTS)
    api_keyword_args: dict[str, Any] = field(default_factory=lambda: API_KEYWORD_ARGS)
    benchmarks: dict[str, Any] = field(default_factory=dict)
    server_cmd: list[str] = field(default_factory=list)
    test_content: list[str] = field(default_factory=lambda: ["completion"])
    service_mode: str = "openai"
    epd_server_cmds: list[list[str]] = field(default_factory=list)
    epd_proxy_args: list[str] = field(default_factory=list)
    extra_config: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Normalize the case after dataclass init.

        Order matters here: port envs are resolved to concrete ports first,
        so that the subsequent $VAR expansion of the command lines sees the
        final port values.
        """
        # Replace placeholder/missing port envs with real free ports.
        port_keys = ["SERVER_PORT", "ENCODE_PORT", "PD_PORT", "PROXY_PORT"]
        for env_key in port_keys:
            if self.envs.get(env_key) in ["DEFAULT_PORT", None]:
                self.envs[env_key] = str(get_open_port())

        # Guard against explicit nulls in the YAML: default factories only
        # apply when a field is omitted, not when it is written as `null`.
        if self.prompts is None:
            self.prompts = PROMPTS
        if self.api_keyword_args is None:
            self.api_keyword_args = API_KEYWORD_ARGS
        if self.benchmarks is None:
            self.benchmarks = {}
        if self.test_content is None:
            self.test_content = []

        # Interpolate $VAR / ${VAR} in all command lines using `envs`.
        self.server_cmd = self._expand_values(self.server_cmd or [], self.envs)
        self.epd_server_cmds = [self._expand_values(cmd, self.envs) for cmd in self.epd_server_cmds]
        self.epd_proxy_args = self._expand_values(self.epd_proxy_args or [], self.envs)

        # Surface unknown YAML fields as instance attributes. NOTE(review):
        # a key matching a dataclass field name would silently shadow the
        # normalized value above -- keys come from trusted test configs.
        for key, value in self.extra_config.items():
            setattr(self, key, value)

    @staticmethod
    def _expand_values(values: list[str], envs: dict[str, Any]) -> list[str]:
        """Interpolate $VAR/${VAR} placeholders with provided env values.

        Placeholders with no matching key in ``envs`` are left verbatim
        (the original `$VAR` text is kept). Non-string args are coerced to
        str before substitution.
        """
        pattern = re.compile(r"\$(\w+)|\$\{(\w+)\}")

        def repl(m: re.Match[str]) -> str:
            # Exactly one of the two groups matches, depending on which
            # alternative ($VAR vs ${VAR}) was taken.
            key = m.group(1) or m.group(2)
            return str(envs.get(key, m.group(0)))

        return [pattern.sub(repl, str(arg)) for arg in values]

    def _get_required_port(self, key: str) -> int:
        """Return the port stored under ``key`` in envs, or raise ValueError."""
        value = self.envs.get(key)
        if value is None:
            raise ValueError(f"Missing required port env: {key}")
        return int(value)

    @property
    def server_port(self) -> int:
        # Port the OpenAI-compatible server listens on.
        return self._get_required_port("SERVER_PORT")

    @property
    def encode_port(self) -> int:
        # Port of the encode instance in EPD mode.
        return self._get_required_port("ENCODE_PORT")

    @property
    def pd_port(self) -> int:
        # Port of the prefill/decode instance in EPD mode.
        return self._get_required_port("PD_PORT")

    @property
    def proxy_port(self) -> int:
        # Port of the EPD disaggregation proxy.
        return self._get_required_port("PROXY_PORT")


class SingleNodeConfigLoader:
    """Load SingleNodeConfig from yaml file."""

    # Used when neither an explicit path nor $CONFIG_YAML_PATH is given.
    DEFAULT_CONFIG_NAME = "Kimi-K2-Thinking.yaml"
    # Keys consumed by SingleNodeConfig directly; anything else in a case
    # dict is forwarded via extra_config.
    STANDARD_CASE_FIELDS = {
        "name",
        "model",
        "envs",
        "prompts",
        "api_keyword_args",
        "benchmarks",
        "service_mode",
        "server_cmd",
        "server_cmd_extra",
        "test_content",
        "epd_server_cmds",
        "epd_proxy_args",
    }

    @classmethod
    def from_yaml_cases(cls, yaml_path: str | None = None) -> list[SingleNodeConfig]:
        """Load, validate and parse all test cases from a config YAML.

        Args:
            yaml_path: File name relative to CONFIG_BASE_PATH; falls back to
                the CONFIG_YAML_PATH env var, then DEFAULT_CONFIG_NAME.

        Raises:
            KeyError: if `test_cases` is missing or a case lacks a required
                field for its service_mode.
            TypeError: if `test_cases` is not a list.
            ValueError: if `test_cases` is empty or a case name is invalid.
        """
        config = cls._load_yaml(yaml_path)

        if "test_cases" not in config:
            raise KeyError("test_cases field is required in config yaml")

        cases = config.get("test_cases")
        if not isinstance(cases, list):
            raise TypeError("test_cases must be a list")
        cls._validate_para(cases)

        return cls._parse_test_cases(cases)

    @classmethod
    def _load_yaml(cls, yaml_path: str | None) -> dict[str, Any]:
        """Read and parse the config YAML under CONFIG_BASE_PATH."""
        if not yaml_path:
            # CI passes the file name through the CONFIG_YAML_PATH env var.
            yaml_path = os.getenv("CONFIG_YAML_PATH", cls.DEFAULT_CONFIG_NAME)

        full_path = os.path.join(CONFIG_BASE_PATH, yaml_path)
        logger.info("Loading config yaml: %s", full_path)

        with open(full_path) as f:
            return yaml.safe_load(f)

    @staticmethod
    def _validate_para(cases: list[dict[str, Any]]) -> None:
        """Validate presence of required per-case fields.

        Required fields depend on service_mode: "epd" cases need
        epd_server_cmds/epd_proxy_args, all other modes need server_cmd.
        """
        if not cases:
            raise ValueError("test_cases is empty")
        for case in cases:
            mode = case.get("service_mode", "openai")
            required = ["name", "model", "envs"]
            if mode == "epd":
                required.extend(["epd_server_cmds", "epd_proxy_args"])
            else:
                required.append("server_cmd")
            missing = [k for k in required if k not in case]
            if missing:
                raise KeyError(f"Missing required config fields: {missing}")

            if not isinstance(case["name"], str) or not case["name"].strip():
                raise ValueError("test case field 'name' must be a non-empty string")

    @classmethod
    def _parse_test_cases(cls, cases: list[dict[str, Any]]) -> list[SingleNodeConfig]:
        """Convert validated case dicts into SingleNodeConfig objects."""
        result: list[SingleNodeConfig] = []
        for case in cases:
            # server_cmd_extra lets a YAML case append args to a shared base
            # command without repeating it.
            server_cmd = case.get("server_cmd", [])
            server_cmd_extra = case.get("server_cmd_extra", [])
            full_cmd = list(server_cmd) + list(server_cmd_extra)
            extra_case_fields = {key: value for key, value in case.items() if key not in cls.STANDARD_CASE_FIELDS}

            # Safe parsing mapping
            result.append(
                SingleNodeConfig(
                    name=case["name"],
                    model=case["model"],
                    envs=case.get("envs", {}),
                    server_cmd=full_cmd,
                    epd_server_cmds=case.get("epd_server_cmds", []),
                    epd_proxy_args=case.get("epd_proxy_args", []),
                    benchmarks=case.get("benchmarks", {}),
                    prompts=case.get("prompts", PROMPTS),
                    api_keyword_args=case.get("api_keyword_args", API_KEYWORD_ARGS),
                    test_content=case.get("test_content", ["completion"]),
                    service_mode=case.get("service_mode", "openai"),
                    extra_config=extra_case_fields,
                )
            )
        return result
"""YAML-driven single-node nightly e2e test runner.

For each case loaded from the config YAML this module spins up a vLLM
server (plain OpenAI mode or EPD disaggregated mode), runs the requested
request-level checks, then optional aisbench benchmarks and YAML-declared
metric comparisons.
"""
import logging
import operator
from typing import Any

import openai
import pytest

from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer, RemoteOpenAIServer
from tests.e2e.nightly.single_node.models.scripts.single_node_config import (
    SingleNodeConfig,
    SingleNodeConfigLoader,
)
from tools.aisbench import run_aisbench_cases

logger = logging.getLogger(__name__)

# Comparison operators accepted in benchmark_comparisons_args entries.
# A dispatch table replaces a repetitive if/elif chain of asserts.
_COMPARISON_OPS = {
    "<": operator.lt,
    ">": operator.gt,
    "<=": operator.le,
    ">=": operator.ge,
}

# Loaded at import time so pytest can parametrize test_single_node on them.
configs = SingleNodeConfigLoader.from_yaml_cases()


async def run_completion_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    """Send the configured prompts to /v1/completions and require output."""
    client = server.get_async_client()
    batch = await client.completions.create(
        model=config.model,
        prompt=config.prompts,
        **config.api_keyword_args,
    )
    choices: list[openai.types.CompletionChoice] = batch.choices
    assert choices[0].text, "empty response"
    print(choices)


async def run_image_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    """Send a multimodal (image) request against the server."""
    from tools.send_mm_request import send_image_request

    send_image_request(config.model, server)


async def run_chat_completion_test(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    """Send the first configured prompt to /v1/chat/completions."""
    from tools.send_request import send_v1_chat_completions

    send_v1_chat_completions(
        config.prompts[0],
        model=config.model,
        server=server,
        request_args=config.api_keyword_args,
    )


def _collect_metric(metric: str, config: SingleNodeConfig, results: Any) -> "dict[str, Any] | None":
    """Map benchmark case names to measured values for one metric.

    Returns None when the metric is not supported.
    """
    if metric == "TTFT":
        from tools.aisbench import get_TTFT

        # Benchmark case names in declaration order; get_TTFT outputs are
        # assumed to follow the same order -- TODO confirm in aisbench.
        valid_keys = [k for k, v in config.benchmarks.items() if v]
        values = get_TTFT(results)
        if len(values) != len(valid_keys):
            # zip() would silently truncate and mis-attribute values;
            # make the mismatch visible instead.
            logger.warning(
                "Metric %s produced %d values for %d benchmark cases",
                metric, len(values), len(valid_keys),
            )
        return dict(zip(valid_keys, values))
    return None


def run_benchmark_comparisons(config: SingleNodeConfig, results: Any) -> None:
    """General assertion engine for aisbench outcomes mapped directly from YAML.

    Each entry of `benchmark_comparisons_args` (from extra_config) names a
    baseline case, a target case, a metric, a ratio and an operator; the
    assertion checked is ``target <op> baseline * ratio``.
    """
    comparisons = config.extra_config.get("benchmark_comparisons_args", [])

    if not comparisons:
        return

    # Metric values are fetched lazily and cached per metric name.
    metrics_cache: dict[str, dict[str, Any]] = {}

    for comp in comparisons:
        metric = comp.get("metric", "TTFT")
        baseline_key = comp.get("baseline")
        target_key = comp.get("target")
        ratio = comp.get("ratio", 1.0)
        op = comp.get("operator", "<")

        if not baseline_key or not target_key:
            logger.warning("Invalid comparison config: missing baseline or target. %s", comp)
            continue

        if metric not in metrics_cache:
            metric_values = _collect_metric(metric, config, results)
            if metric_values is None:
                logger.warning("Unsupported metric for comparison: %s", metric)
                continue
            metrics_cache[metric] = metric_values

        metric_dict = metrics_cache[metric]
        baseline_val = metric_dict.get(baseline_key)
        target_val = metric_dict.get(target_key)

        if baseline_val is None or target_val is None:
            logger.warning("Missing data to compare %s and %s in metrics: %s", baseline_key, target_key, metric_dict)
            continue

        op_func = _COMPARISON_OPS.get(op)
        if op_func is None:
            logger.warning("Unsupported comparison operator: %s", op)
            continue

        expected_threshold = baseline_val * ratio
        eval_str = f"metric {metric}: {target_key}({target_val}) {op} {baseline_key}({baseline_val}) * {ratio}"
        assert op_func(target_val, expected_threshold), f"Assertion Failed: {eval_str} [threshold: {expected_threshold}]"
        print(f"✅ Comparison passed: {eval_str} [threshold: {expected_threshold}]")


# Extend this dictionary to add new test capabilities
TEST_HANDLERS = {
    "completion": run_completion_test,
    "image": run_image_test,
    "chat_completion": run_chat_completion_test,
}


async def _dispatch_tests(config: SingleNodeConfig, server: "RemoteOpenAIServer | DisaggEpdProxy") -> None:
    """Dispatches requested tests defined in yaml."""
    for test_name in config.test_content:
        # benchmark_comparisons is handled after benchmarks run, not here.
        if test_name == "benchmark_comparisons":
            continue

        handler = TEST_HANDLERS.get(test_name)
        if handler:
            await handler(config, server)
        else:
            logger.warning("No handler registered for test content type: %s", test_name)


def _run_benchmarks(config: SingleNodeConfig, port: int) -> None:
    """Run Aisbench benchmarks and process benchmark-dependent custom assertions."""
    aisbench_cases = [v for v in config.benchmarks.values() if v]
    if not aisbench_cases:
        return

    result = run_aisbench_cases(
        model=config.model,
        port=port,
        aisbench_cases=aisbench_cases,
    )

    if "benchmark_comparisons" in config.test_content:
        run_benchmark_comparisons(config, result)


@pytest.mark.asyncio
@pytest.mark.parametrize("config", configs, ids=[config.name for config in configs])
async def test_single_node(config: SingleNodeConfig) -> None:
    """Launch the configured server, run checks, then benchmarks."""
    if config.service_mode == "epd":
        # EPD mode: encode + prefill/decode instances behind a proxy; all
        # client traffic goes through the proxy port.
        with (
            RemoteEPDServer(vllm_serve_args=config.epd_server_cmds, env_dict=config.envs) as _,
            DisaggEpdProxy(proxy_args=config.epd_proxy_args, env_dict=config.envs) as proxy,
        ):
            await _dispatch_tests(config, proxy)
            _run_benchmarks(config, proxy.port)
        return

    # Standard OpenAI service mode
    with RemoteOpenAIServer(
        model=config.model,
        vllm_serve_args=config.server_cmd,
        server_port=config.server_port,
        env_dict=config.envs,
        auto_port=False,
    ) as server:
        await _dispatch_tests(config, server)
        _run_benchmarks(config, config.server_port)
Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -import json -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/DeepSeek-R1-0528-W8A8", -] - -MODES = [ - "single", - "aclgraph", -] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 400, - "max_out_len": 1500, - "batch_size": 1000, - "baseline": 1, - "threshold": 0.97 -}] - - -def config(): - port = get_open_port() - env_dict = { - "OMP_NUM_THREADS": "10", - "OMP_PROC_BIND": "false", - "HCCL_BUFFSIZE": "1024", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True" - } - speculative_config = {"num_speculative_tokens": 1, "method": "mtp"} - additional_config = {"enable_weight_nz_layout": True} - 
server_args = [ - "--quantization", "ascend", "--data-parallel-size", "2", - "--tensor-parallel-size", "8", "--enable-expert-parallel", "--port", - str(port), "--seed", "1024", "--max-model-len", "36864", - "--max-num-batched-tokens", "4096", "--max-num-seqs", "16", - "--trust-remote-code", "--gpu-memory-utilization", "0.9", - "--speculative-config", - json.dumps(speculative_config) - ] - return port, env_dict, additional_config, server_args - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -async def test_models(model: str, mode: str) -> None: - port, env_dict, additional_config, server_args = config() - if mode == "single": - server_args.append("--enforce-eager") - server_args.extend(["--additional-config", json.dumps(additional_config)]) - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - if mode in ["single"]: - return - # aisbench test - run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py deleted file mode 100644 index 35c5b30f2df..00000000000 --- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_0528_w8a8_eplb.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -import json -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases -from .test_deepseek_r1_0528_w8a8 import * - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_models_eplb(model: str) -> None: - port, env_dict, additional_config, server_args = config() - additional_config.update( - { - "eplb_config": { - "dynamic_eplb": "true", - "expert_heat_collection_interval": 1000, - "algorithm_execution_interval": 50, - "eplb_policy_type": 3, - } - } - ) - env_dict.update( - { - "DYNAMIC_EPLB": "true", - } - ) - server_args.extend(["--additional-config", json.dumps(additional_config)]) - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - # aisbench test - 
run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) - diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py b/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py deleted file mode 100644 index b3c62d11a76..00000000000 --- a/tests/e2e/nightly/single_node/models/test_deepseek_r1_w8a8_hbm.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -import json -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/DeepSeek-R1-W8A8", -] - -MODES = [ - "single", -] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 6000, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 32, - "max_out_len": 1500, - "batch_size": 32, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -async def test_models(model: str, mode: str) -> None: - port = get_open_port() - env_dict = { - "HCCL_BUFFSIZE": "1024", - } - - additional_config = { - "ascend_scheduler_config": { - "enabled": False - }, - "torchair_graph_config": { - "enabled": False, - "enable_multistream_shared_expert": False - } - } - - server_args = [ - "--quantization", "ascend", "--port", - str(port), "--data-parallel-size", "8", "--data-parallel-size-local", - "8", "--data-parallel-rpc-port", "13389", "--tensor-parallel-size", - "2", "--enable-expert-parallel", "--seed", "1024", "--max-num-seqs", - "32", "--max-model-len", "6000", "--max-num-batched-tokens", "6000", - "--trust-remote-code", "--gpu-memory-utilization", "0.92", - "--no-enable-prefix-caching", "--reasoning-parser", "deepseek_r1" - ] - - if mode == "single": - server_args.append("--enforce-eager") - - server_args.extend(["--additional-config", json.dumps(additional_config)]) - 
- request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - # aisbench test - if mode in ["single"]: - return - run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) diff --git a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py b/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py deleted file mode 100644 index c1cc4f71692..00000000000 --- a/tests/e2e/nightly/single_node/models/test_deepseek_v3_2_w8a8.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = ["vllm-ascend/DeepSeek-V3.2-W8A8"] - -TENSOR_PARALLELS = [8] -DATA_PARALLELS = [2] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 4096, - "batch_size": 8, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 1, - "max_out_len": 1500, - "batch_size": 1, - "request_rate": 11.2, - "baseline": 134, - "threshold": 0.97 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 100, - "max_out_len": 1500, - "batch_size": 4, - "request_rate": 11.2, - "baseline": 134, - "threshold": 0.97 -} -] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -@pytest.mark.parametrize("dp_size", DATA_PARALLELS) -async def test_models(model: str, tp_size: int, dp_size: int) -> None: - port = get_open_port() - env_dict = { - "HCCL_OP_EXPANSION_MODE": "AIV", - "OMP_PROC_BIND": "false", - "OMP_NUM_THREADS": "1", - "HCCL_BUFFSIZE": "1024", - "VLLM_ASCEND_ENABLE_MLAPO": "1", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", - "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1", - "VLLM_ENGINE_READY_TIMEOUT_S": "1800" - } - - server_args = [ - "--enable-expert-parallel", "--tensor-parallel-size", - str(tp_size), "--data-parallel-size", - 
str(dp_size), "--port", - str(port), "--max-model-len", "8192", "--max-num-batched-tokens", - "8192", "--max-num-seqs", "4", "--trust-remote-code", "--quantization", - "ascend", "--gpu-memory-utilization", "0.98", "--compilation-config", - '{"cudagraph_capture_sizes":[8, 16, 24, 32, 40, 48], "cudagraph_mode":"FULL_DECODE_ONLY"}', - "--speculative-config", - '{"num_speculative_tokens": 3, "method":"deepseek_mtp"}', - "--additional-config", - '{"layer_sharding": ["q_b_proj", "o_proj"]}', - "--reasoning-parser", "deepseek_v3", "--tokenizer_mode", "deepseek_v32" - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_glm4_5.py b/tests/e2e/nightly/single_node/models/test_glm4_5.py deleted file mode 100644 index 49809cfb7eb..00000000000 --- a/tests/e2e/nightly/single_node/models/test_glm4_5.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# This file is a part of the vllm-ascend project. -# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "ZhipuAI/GLM-4.5", -] - -TENSOR_PARALLELS = [8] -DATA_PARALLELS = [2] -FULL_GRAPH = [True, False] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 4096, - "batch_size": 8, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 16, - "max_out_len": 1500, - "batch_size": 8, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -@pytest.mark.parametrize("dp_size", DATA_PARALLELS) -@pytest.mark.parametrize("full_graph", FULL_GRAPH) -async def test_models(model: str, tp_size: int, dp_size: int, - full_graph: bool) -> None: - port = get_open_port() - env_dict = {"HCCL_BUFFSIZE": "1024"} - server_args = [ - "--no-enable-prefix-caching", - "--enable-expert-parallel", - "--tensor-parallel-size", - str(tp_size), - "--data-parallel-size", - str(dp_size), - "--port", - str(port), - "--max-model-len", - "8192", - "--max-num-batched-tokens", - "8192", - "--block-size", - "16", - "--trust-remote-code", - "--gpu-memory-utilization", - "0.9", - ] - if full_graph: - server_args += [ - "--compilation-config", - '{"cudagraph_capture": [1,2,4,8,16], "cudagraph_model":"FULL_DECODE_ONLY"}' - ] - request_keyword_args: dict[str, Any] = { - 
**api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py b/tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py deleted file mode 100644 index a4d64f2b378..00000000000 --- a/tests/e2e/nightly/single_node/models/test_kimi_k2_thinking.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "moonshotai/Kimi-K2-Thinking", -] - -TENSOR_PARALLELS = [16] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 4096, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 512, - "max_out_len": 256, - "batch_size": 64, - "trust_remote_code": True, - "request_rate": 11.2, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "HCCL_BUFFSIZE": "1024", - "TASK_QUEUE_ENABLE": "1", - "OMP_PROC_BIND": "false", - "HCCL_OP_EXPANSION_MODE": "AIV", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True" - } - server_args = [ - "--tensor-parallel-size", - str(tp_size), - "--port", - str(port), - "--max-model-len", - "8192", - "--max-num-batched-tokens", - "8192", - "--max-num-seqs", - "12", - "--gpu-memory-utilization", - "0.9", - "--trust-remote-code", - "--enable-expert-parallel", - "--no-enable-prefix-caching", - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - 
prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_mtpx_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/single_node/models/test_mtpx_deepseek_r1_0528_w8a8.py deleted file mode 100644 index d473d7f9e86..00000000000 --- a/tests/e2e/nightly/single_node/models/test_mtpx_deepseek_r1_0528_w8a8.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -import json -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/DeepSeek-R1-0528-W8A8", -] - -MODES = ["mtp2", "mtp3"] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_gsm8k = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}] - -aisbench_aime = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/aime2024", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "aime2024/aime2024_gen_0_shot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 86.67, - "threshold": 7 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -async def test_models(model: str, mode: str) -> None: - port = get_open_port() - env_dict = { - "OMP_NUM_THREADS": "100", - "OMP_PROC_BIND": "false", - "HCCL_BUFFSIZE": "1024", - "VLLM_RPC_TIMEOUT": "3600000", - "VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS": "3600000" - } - speculative_config = {"num_speculative_tokens": 2, "method": "mtp"} - compilation_config = { - "cudagraph_capture_sizes": [56], - "cudagraph_mode": "FULL_DECODE_ONLY" - } - server_args = [ - "--quantization", - "ascend", - "--seed", - "1024", - "--no-enable-prefix-caching", - "--data-parallel-size", - "2", - "--tensor-parallel-size", - "8", - "--enable-expert-parallel", - "--port", - str(port), - "--max-model-len", - "40960", - "--max-num-seqs", - "14", - "--trust-remote-code", - ] - if mode == "mtp2": - server_args.extend(["--max-num-batched-tokens", "4096"]) - server_args.extend( - ["--speculative-config", - json.dumps(speculative_config)]) - 
server_args.extend(["--gpu-memory-utilization", "0.92"]) - aisbench_cases = aisbench_gsm8k - if mode == "mtp3": - env_dict["HCCL_OP_EXPANSION_MODE"] = "AIV" - server_args.extend(["--max-num-batched-tokens", "2048"]) - speculative_config["num_speculative_tokens"] = 3 - server_args.extend( - ["--speculative-config", - json.dumps(speculative_config)]) - server_args.extend(["--gpu-memory-utilization", "0.9"]) - server_args.extend( - ["--compilation-config", - json.dumps(compilation_config)]) - aisbench_cases = aisbench_aime - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - # aisbench test - run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) diff --git a/tests/e2e/nightly/single_node/models/test_prefix_cache_deepseek_r1_0528_w8a8.py b/tests/e2e/nightly/single_node/models/test_prefix_cache_deepseek_r1_0528_w8a8.py deleted file mode 100644 index f165403689c..00000000000 --- a/tests/e2e/nightly/single_node/models/test_prefix_cache_deepseek_r1_0528_w8a8.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -import json - -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import get_TTFT, run_aisbench_cases - -MODELS = [ - "vllm-ascend/DeepSeek-R1-0528-W8A8", -] - -aisbench_warm_up = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in1024-bs210", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 210, - "max_out_len": 2, - "batch_size": 1000, - "baseline": 0, - "threshold": 0.97 -}] - -aisbench_cases0 = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/prefix0-in3500-bs210", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 210, - "max_out_len": 1500, - "batch_size": 18, - "baseline": 1, - "threshold": 0.97 -}] - -aisbench_cases75 = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/prefix75-in3500-bs210", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 210, - "max_out_len": 1500, - "batch_size": 18, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_models(model: str) -> None: - port = get_open_port() - env_dict = { - "OMP_NUM_THREADS": "10", - "OMP_PROC_BIND": "false", - "HCCL_BUFFSIZE": "1024", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", - } - additional_config = {"enable_weight_nz_layout": True} - speculative_config = {"num_speculative_tokens": 1, "method": "mtp"} - server_args = [ - "--quantization", "ascend", "--data-parallel-size", "2", - "--tensor-parallel-size", "8", "--enable-expert-parallel", "--port", - str(port), "--seed", "1024", "--max-model-len", "5200", - 
"--max-num-batched-tokens", "4096", "--max-num-seqs", "16", - "--trust-remote-code", "--gpu-memory-utilization", "0.9", - "--additional-config", - json.dumps(additional_config), "--speculative-config", - json.dumps(speculative_config) - ] - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False): - run_aisbench_cases(model, port, aisbench_warm_up) - result = run_aisbench_cases(model, port, aisbench_cases0) - TTFT0 = get_TTFT(result) - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False): - run_aisbench_cases(model, port, aisbench_warm_up) - result = run_aisbench_cases(model, port, aisbench_cases75) - TTFT75 = get_TTFT(result) - assert TTFT75 < 0.8 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.8*TTFT for prefix0 {TTFT0}." - print( - f"The TTFT for prefix75 {TTFT75} is less than 0.8*TTFT for prefix0 {TTFT0}." - ) diff --git a/tests/e2e/nightly/single_node/models/test_prefix_cache_qwen3_32b_int8.py b/tests/e2e/nightly/single_node/models/test_prefix_cache_qwen3_32b_int8.py deleted file mode 100644 index 8a1817bd227..00000000000 --- a/tests/e2e/nightly/single_node/models/test_prefix_cache_qwen3_32b_int8.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -import json - -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import get_TTFT, run_aisbench_cases - -MODELS = [ - "vllm-ascend/Qwen3-32B-W8A8", -] - -aisbench_warm_up = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in1024-bs210", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 210, - "max_out_len": 2, - "batch_size": 1000, - "baseline": 0, - "threshold": 0.97 -}] - -aisbench_cases0 = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/prefix0-in3500-bs210", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 210, - "max_out_len": 1500, - "batch_size": 48, - "baseline": 1, - "threshold": 0.97 -}] - -aisbench_cases75 = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/prefix75-in3500-bs210", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 210, - "max_out_len": 1500, - "batch_size": 48, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_models(model: str) -> None: - port = get_open_port() - env_dict = {"TASK_QUEUE_ENABLE": "1", "HCCL_OP_EXPANSION_MODE": "AIV"} - additional_config = {"enable_weight_nz_layout": True} - server_args = [ - "--quantization", "ascend", "--reasoning-parser", "qwen3", - "--tensor-parallel-size", "4", "--port", - str(port), "--max-model-len", "8192", "--max-num-batched-tokens", - "8192", "--max-num-seqs", "256", "--trust-remote-code", - "--gpu-memory-utilization", "0.9", "--additional-config", - json.dumps(additional_config) - ] - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False): - run_aisbench_cases(model, port, aisbench_warm_up) - result = run_aisbench_cases(model, port, 
aisbench_cases0) - TTFT0 = get_TTFT(result) - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False): - run_aisbench_cases(model, port, aisbench_warm_up) - result = run_aisbench_cases(model, port, aisbench_cases75) - TTFT75 = get_TTFT(result) - assert TTFT75 < 0.8 * TTFT0, f"The TTFT for prefix75 {TTFT75} is not less than 0.8*TTFT for prefix0 {TTFT0}." - print( - f"The TTFT for prefix75 {TTFT75} is less than 0.8*TTFT for prefix0 {TTFT0}." - ) diff --git a/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_32b.py b/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_32b.py deleted file mode 100644 index a8647716d98..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_32b.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases -from tools.send_mm_request import send_image_request - -MODELS = [ - "Qwen/Qwen2.5-VL-32B-Instruct", -] - -TENSOR_PARALLELS = [4] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/textvqa-lite", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "max_out_len": 2048, - "batch_size": 128, - "baseline": 76.22, - "temperature": 0, - "top_k": -1, - "top_p": 1, - "repetition_penalty": 1, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/textvqa-perf-1080p", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "num_prompts": 512, - "max_out_len": 256, - "batch_size": 128, - "temperature": 0, - "top_k": -1, - "top_p": 1, - "repetition_penalty": 1, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "TASK_QUEUE_ENABLE": "1", - "VLLM_ASCEND_ENABLE_NZ": "0", - "HCCL_OP_EXPANSION_MODE": "AIV" - } - server_args = [ - "--no-enable-prefix-caching", "--mm-processor-cache-gb", "0", - "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--max-model-len", "30000", "--max-num-batched-tokens", - "40000", "--max-num-seqs", "400", "--trust-remote-code", - "--gpu-memory-utilization", "0.8", "--compilation_config", - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - 
env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - send_image_request(model, server) - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b.py b/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b.py deleted file mode 100644 index 52478b385af..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases -from tools.send_mm_request import send_image_request - -MODELS = [ - "Qwen/Qwen2.5-VL-7B-Instruct", -] - -TENSOR_PARALLELS = [4] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/textvqa-lite", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "max_out_len": 2048, - "batch_size": 128, - "baseline": 82.05, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/textvqa-perf-1080p", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "num_prompts": 512, - "max_out_len": 256, - "batch_size": 128, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "TASK_QUEUE_ENABLE": "1", - "VLLM_ASCEND_ENABLE_NZ": "0", - "HCCL_OP_EXPANSION_MODE": "AIV" - } - server_args = [ - "--no-enable-prefix-caching", "--mm-processor-cache-gb", "0", - "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--max-model-len", "30000", "--max-num-batched-tokens", - "40000", "--max-num-seqs", "400", "--trust-remote-code", - "--gpu-memory-utilization", "0.8", "--compilation_config", - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - 
**request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - send_image_request(model, server) - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py b/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py deleted file mode 100644 index fee7a705868..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen2_5_vl_7b_epd.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# - -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import DisaggEpdProxy, RemoteEPDServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "Qwen/Qwen2.5-VL-7B-Instruct", -] -SHARED_STORAGE_PATH = "/dev/shm/epd/storage" -TENSOR_PARALLELS = [1] - -warmup_cases = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/textvqa-perf-1080p", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "num_prompts": 50, - "max_out_len": 20, - "batch_size": 32, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/textvqa-lite", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "max_out_len": 2048, - "batch_size": 128, - "baseline": 82.05, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/textvqa-perf-1080p", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "textvqa/textvqa_gen_base64", - "num_prompts": 512, - "max_out_len": 256, - "batch_size": 128, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - encode_port = get_open_port() - pd_port = get_open_port() - vllm_server_args = [ - [ - "--port", - str(encode_port), "--model", model, "--gpu-memory-utilization", - "0.01", "--tensor-parallel-size", - str(tp_size), "--enforce-eager", "--no-enable-prefix-caching", - "--max-model-len", "10000", "--max-num-batched-tokens", "10000", - "--max-num-seqs", "1", "--ec-transfer-config", - '{"ec_connector_extra_config":{"shared_storage_path":"' + - SHARED_STORAGE_PATH + - '"},"ec_connector":"ECExampleConnector","ec_role": "ec_producer"}' - ], - [ - "--port", - str(pd_port), "--model", model, "--gpu-memory-utilization", 
"0.95", - "--tensor-parallel-size", - str(tp_size), "--enforce-eager", "--max-model-len", "10000", - "--max-num-batched-tokens", "10000", "--max-num-seqs", "128", - "--ec-transfer-config", - '{"ec_connector_extra_config":{"shared_storage_path":"' + - SHARED_STORAGE_PATH + - '"},"ec_connector":"ECExampleConnector","ec_role": "ec_consumer"}' - ] - ] - proxy_port = get_open_port() - proxy_args = [ - "--host", "127.0.0.1", "--port", - str(proxy_port), "--encode-servers-urls", - f"http://localhost:{encode_port}", "--decode-servers-urls", - f"http://localhost:{pd_port}", "--prefill-servers-urls", "disable" - ] - - with RemoteEPDServer(vllm_serve_args=vllm_server_args) as _: - with DisaggEpdProxy(proxy_args=proxy_args) as _: - # warm up - run_aisbench_cases(model=model, - port=proxy_port, - aisbench_cases=warmup_cases) - # aisbench test - run_aisbench_cases(model, proxy_port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py b/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py deleted file mode 100644 index 2d54c27523c..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_a22b_w8a8_eplb.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -import json -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases -from .test_qwen3_235b_w8a8 import * - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_models_eplb(model: str) -> None: - port, aisbench_cases, env_dict, compilation_config, server_args = config() - env_dict.update( - { - "DYNAMIC_EPLB": "true", - } - ) - additional_config: dict[str, Any] = {} - additional_config["eplb_config"] = { - "dynamic_eplb": "true", - "expert_heat_collection_interval": 600, - "algorithm_execution_interval": 50, - "num_redundant_experts": 16, - "eplb_policy_type": 2, - } - server_args.extend(["--additional-config", json.dumps(additional_config)]) - server_args.extend( - ["--compilation-config", - json.dumps(compilation_config)]) - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - # aisbench test - run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py b/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py deleted file mode 100644 index c468a8515ae..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_235b_w8a8.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -import json -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/Qwen3-235B-A22B-W8A8", -] - -MODES = ["full_graph", "piecewise"] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -def config(): - port = get_open_port() - aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "top_k": 20, - "baseline": 95, - "threshold": 5 - }] - env_dict = { - "OMP_NUM_THREADS": "10", - "OMP_PROC_BIND": "false", - "HCCL_BUFFSIZE": "1024", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", - "VLLM_ASCEND_ENABLE_FLASHCOMM1": "1" - } - compilation_config = {"cudagraph_mode": "FULL_DECODE_ONLY"} - server_args = [ - "--quantization", "ascend", "--async-scheduling", - "--data-parallel-size", "4", "--tensor-parallel-size", "4", - "--enable-expert-parallel", "--port", - str(port), "--max-model-len", "40960", "--max-num-batched-tokens", - "8192", "--max-num-seqs", "12", "--trust-remote-code", - "--gpu-memory-utilization", "0.9" - ] - return port, aisbench_cases, env_dict, 
compilation_config, server_args - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -async def test_models(model: str, mode: str) -> None: - port, aisbench_cases, env_dict, compilation_config, server_args = config() - if mode == "piecewise": - compilation_config["cudagraph_mode"] = "PIECEWISE" - server_args.extend( - ["--compilation-config", - json.dumps(compilation_config)]) - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - # aisbench test - run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_30b_w8a8.py b/tests/e2e/nightly/single_node/models/test_qwen3_30b_w8a8.py deleted file mode 100644 index 491b3582e94..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_30b_w8a8.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/Qwen3-30B-A3B-W8A8", -] - -TENSOR_PARALLELS = [1] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 180, - "max_out_len": 1500, - "batch_size": 45, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "OMP_PROC_BIND": "false", - "OMP_NUM_THREADS": "10", - "HCCL_BUFFSIZE": "1024", - "HCCL_OP_EXPANSION_MODE": "AIV", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True" - } - server_args = [ - "--quantization", "ascend", "--async-scheduling", - "--no-enable-prefix-caching", "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--max-model-len", "5600", "--max-num-batched-tokens", - "16384", "--max-num-seqs", "100", "--trust-remote-code", - "--gpu-memory-utilization", "0.9", "--compilation-config", - '{"cudagraph_mode": "FULL_DECODE_ONLY"}' - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - # aisbench test - run_aisbench_cases(model, port, 
aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_32b.py b/tests/e2e/nightly/single_node/models/test_qwen3_32b.py deleted file mode 100644 index 9a358e64e3b..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_32b.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "Qwen/Qwen3-32B", -] - -TENSOR_PARALLELS = [4] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 80, - "max_out_len": 1500, - "batch_size": 20, - "request_rate": 0, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio 
-@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "TASK_QUEUE_ENABLE": "1", - "OMP_PROC_BIND": "false", - "HCCL_OP_EXPANSION_MODE": "AIV", - "PAGED_ATTENTION_MASK_LEN": "5500" - } - server_args = [ - "--no-enable-prefix-caching", "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--max-model-len", "36864", "--max-num-batched-tokens", - "36864", "--block-size", "128", "--trust-remote-code", - "--gpu-memory-utilization", "0.9", "--additional-config", - '{"enable_weight_nz_layout":true}' - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py b/tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py deleted file mode 100644 index f2507028bc4..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_32b_int8.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -import json -import os -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/Qwen3-32B-W8A8", -] - -MODES = [ - "aclgraph", - "single", -] - -TENSOR_PARALLELS = [4] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -batch_size_dict = { - "linux-aarch64-a2b3-4": 72, - "linux-aarch64-a3-4": 76, -} -VLLM_CI_RUNNER = os.getenv("VLLM_CI_RUNNER", "linux-aarch64-a2b3-4") -performance_batch_size = batch_size_dict.get(VLLM_CI_RUNNER, 1) - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/aime2024", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "aime2024/aime2024_gen_0_shot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 83.33, - "threshold": 7 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 4 * performance_batch_size, - "max_out_len": 1500, - "batch_size": performance_batch_size, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, mode: str, tp_size: int) -> None: - port = 
get_open_port() - env_dict = { - "TASK_QUEUE_ENABLE": "1", - "HCCL_OP_EXPANSION_MODE": "AIV", - "VLLM_ASCEND_ENABLE_FLASHCOMM": "1", - } - compilation_config = { - "cudagraph_mode": - "FULL_DECODE_ONLY", - "cudagraph_capture_sizes": - [1, 12, 16, 20, 24, 32, 48, 60, 64, 68, 72, 76, 80] - } - server_args = [ - "--quantization", "ascend", "--no-enable-prefix-caching", - "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--max-model-len", "40960", "--max-num-batched-tokens", - "40960", "--block-size", "128", "--trust-remote-code", - "--reasoning-parser", "qwen3", "--gpu-memory-utilization", "0.9", - "--async-scheduling", "--additional-config", - '{"weight_prefetch_config":{"enabled":true}}', - ] - if mode == "single": - server_args.append("--enforce-eager") - if mode == "aclgraph": - server_args.extend( - ["--compilation-config", - json.dumps(compilation_config)]) - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - if mode == "single": - return - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_32b_int8_a3_feature_stack3.py b/tests/e2e/nightly/single_node/models/test_qwen3_32b_int8_a3_feature_stack3.py deleted file mode 100644 index 3410f31cb3e..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_32b_int8_a3_feature_stack3.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# - -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases -from tools.send_request import send_v1_chat_completions - -MODELS = [ - "vllm-ascend/Qwen3-32B-W8A8", -] - -TENSOR_PARALLELS = [4] - -prompts = [ - "9.11 and 9.8, which is greater?", -] - -api_keyword_args = { - "chat_template_kwargs": { - "enable_thinking": True - }, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_noncot_chat_prompt", - "max_out_len": 10240, - "batch_size": 32, - "baseline": 96, - "threshold": 4 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 240, - "max_out_len": 1500, - "batch_size": 60, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "VLLM_USE": "1", - "TASK_QUEUE_ENABLE": "1", - "HCCL_OP_EXPANSION_MODE": "AIV", - "OMP_PROC_BIND": "false", - "VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE": "1", - 
"VLLM_ASCEND_ENABLE_FLASHCOMM": "1", - } - server_args = [ - "--quantization", "ascend", "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--trust-remote-code", "--reasoning-parser", "qwen3", - "--distributed_executor_backend", "mp", "--gpu-memory-utilization", - "0.9", "--block-size", "128", "--max-num-seqs", "256", - "--enforce-eager", "--max-model-len", "35840", - "--max-num-batched-tokens", "35840", "--additional-config", - '{"enable_weight_nz_layout":true, "weight_prefetch_config":{"enabled": true}}', - "--compilation-config", - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,8,24,48,60]}' - ] - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - send_v1_chat_completions(prompts[0], - model, - server, - request_args=api_keyword_args) - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/test_qwen3_next.py b/tests/e2e/nightly/single_node/models/test_qwen3_next.py deleted file mode 100644 index c05fab6a923..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_next.py +++ /dev/null @@ -1,111 +0,0 @@ -import json -import os -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "Qwen/Qwen3-Next-80B-A3B-Instruct", -] - -MODES = ["aclgraph"] - -TENSOR_PARALLELS = [4] -MAX_NUM_BATCHED_TOKENS = [8192, 32768] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -batch_size_dict = { - "linux-aarch64-a2b3-4": 64, - "linux-aarch64-a3-4": 64, -} -VLLM_CI_RUNNER = os.getenv("VLLM_CI_RUNNER", "linux-aarch64-a2b3-4") -performance_batch_size = batch_size_dict.get(VLLM_CI_RUNNER, 1) - -aisbench_cases = [{ - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - 
"request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 4 * performance_batch_size, - "max_out_len": 1500, - "batch_size": performance_batch_size, - "baseline": 1, - "threshold": 0.97 -}, { - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "top_k": 20, - "baseline": 95, - "threshold": 5 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -@pytest.mark.parametrize("max_num_batched_tokens", MAX_NUM_BATCHED_TOKENS) -async def test_models(model: str, mode: str, tp_size: int, - max_num_batched_tokens: int) -> None: - port = get_open_port() - env_dict = { - "OMP_NUM_THREADS": "10", - "OMP_PROC_BIND": "false", - "HCCL_BUFFSIZE": "1024", - "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True", - } - server_args = [ - "--tensor-parallel-size", - str(tp_size), - "--port", - str(port), - "--max-model-len", - "40960", - "--max-num-batched-tokens", - str(max_num_batched_tokens), - "--trust-remote-code", - "--async-scheduling", - "--no-enable-prefix-caching", - "--enable-expert-parallel", - "--gpu-memory-utilization", - "0.8", - "--max-num-seqs", - "64", - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - if mode == "single": - return - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git 
a/tests/e2e/nightly/single_node/models/test_qwen3_next_w8a8.py b/tests/e2e/nightly/single_node/models/test_qwen3_next_w8a8.py deleted file mode 100644 index a08e3fbd3ec..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwen3_next_w8a8.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. -# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "vllm-ascend/Qwen3-Next-80B-A3B-Instruct-W8A8", -] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -async def test_models(model: str) -> None: - port = get_open_port() - env_dict = { - "OMP_NUM_THREADS": "10", - "OMP_PROC_BIND": "false", - "HCCL_BUFFSIZE": "1024", - } - server_args = [ - "--quantization", - "ascend", - "--async-scheduling", - "--no-enable-prefix-caching", - 
"--data-parallel-size", - "1", - "--tensor-parallel-size", - "4", - "--enable-expert-parallel", - "--port", - str(port), - "--max-model-len", - "40960", - "--max-num-batched-tokens", - "8192", - "--max-num-seqs", - "32", - "--trust-remote-code", - "--gpu-memory-utilization", - "0.65", - "--compilation-config", - '{"cudagraph_capture_sizes": [32]}', - ] - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - print(choices) - # aisbench test - run_aisbench_cases(model, - port, - aisbench_cases, - server_args=server_args) diff --git a/tests/e2e/nightly/single_node/models/test_qwq_32b.py b/tests/e2e/nightly/single_node/models/test_qwq_32b.py deleted file mode 100644 index b5bcb89b5a6..00000000000 --- a/tests/e2e/nightly/single_node/models/test_qwq_32b.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. -# Copyright 2023 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# This file is a part of the vllm-ascend project. 
-# -from typing import Any - -import openai -import pytest -from vllm.utils.network_utils import get_open_port - -from tests.e2e.conftest import RemoteOpenAIServer -from tools.aisbench import run_aisbench_cases - -MODELS = [ - "Qwen/QwQ-32B", -] - -MODES = [ - "aclgraph", - "single", -] - -TENSOR_PARALLELS = [4] - -prompts = [ - "San Francisco is a", -] - -api_keyword_args = { - "max_tokens": 10, -} - -aisbench_cases = [{ - "case_type": "accuracy", - "dataset_path": "vllm-ascend/gsm8k-lite", - "request_conf": "vllm_api_general_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_chat_prompt", - "max_out_len": 32768, - "batch_size": 32, - "baseline": 95, - "threshold": 5 -}, { - "case_type": "performance", - "dataset_path": "vllm-ascend/GSM8K-in3500-bs400", - "request_conf": "vllm_api_stream_chat", - "dataset_conf": "gsm8k/gsm8k_gen_0_shot_cot_str_perf", - "num_prompts": 240, - "max_out_len": 1500, - "batch_size": 60, - "baseline": 1, - "threshold": 0.97 -}] - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("mode", MODES) -@pytest.mark.parametrize("tp_size", TENSOR_PARALLELS) -async def test_models(model: str, mode: str, tp_size: int) -> None: - port = get_open_port() - env_dict = { - "TASK_QUEUE_ENABLE": "1", - "OMP_PROC_BIND": "false", - "HCCL_OP_EXPANSION_MODE": "AIV", - "VLLM_ASCEND_ENABLE_FLASHCOMM": "1", - "VLLM_ASCEND_ENABLE_DEBSE_OPTIMIZE": "1" - } - server_args = [ - "--tensor-parallel-size", - str(tp_size), "--port", - str(port), "--max-model-len", "36864", "--max-num-batched-tokens", - "36864", "--block-size", "128", "--trust-remote-code", - "--gpu-memory-utilization", "0.9", "--compilation_config", - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}', - "--reasoning-parser", "deepseek_r1", "--distributed_executor_backend", - "mp", "--additional-config", '{"weight_prefetch_config":{"enabled":true}}' - ] - if mode == "single": - server_args.remove("--compilation_config") - 
server_args.remove( - '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes": [1, 8, 24, 48, 60]}' - ) - server_args.append("--enforce-eager") - request_keyword_args: dict[str, Any] = { - **api_keyword_args, - } - with RemoteOpenAIServer(model, - server_args, - server_port=port, - env_dict=env_dict, - auto_port=False) as server: - client = server.get_async_client() - batch = await client.completions.create( - model=model, - prompt=prompts, - **request_keyword_args, - ) - choices: list[openai.types.CompletionChoice] = batch.choices - assert choices[0].text, "empty response" - if mode == "single": - return - # aisbench test - run_aisbench_cases(model, port, aisbench_cases) diff --git a/tests/e2e/nightly/single_node/models/__init__.py b/tests/e2e/nightly/single_node/ops/multicard_ops_a2/__init__.py similarity index 100% rename from tests/e2e/nightly/single_node/models/__init__.py rename to tests/e2e/nightly/single_node/ops/multicard_ops_a2/__init__.py diff --git a/tools/aisbench.py b/tools/aisbench.py index dc22dded255..b04d731efcf 100644 --- a/tools/aisbench.py +++ b/tools/aisbench.py @@ -245,9 +245,11 @@ def run_aisbench_cases(model, port, aisbench_cases, server_args="", host_ip="loc return aisbench_results -def get_TTFT(result): - TTFT = result[0][0].loc["TTFT", "Average"][:-3] - return float(TTFT) +def get_TTFT(results): + TTFT = [] + for i in range(len(results)): + TTFT.append(float(results[i][0].loc["TTFT", "Average"][:-3])) + return TTFT temp_dir = tempfile.gettempdir()