1 change: 1 addition & 0 deletions .github/workflows/_e2e_test.yaml
@@ -267,6 +267,7 @@ jobs:
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
pytest -sv --durations=0 tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Kimi_K2_Thinking_W4A16
pytest -sv --durations=0 tests/e2e/multicard/test_data_parallel_tp2.py
pytest -sv --durations=0 tests/e2e/multicard/long_sequence/test_basic.py

- name: Install Ascend toolkit & triton_ascend (for Qwen3-Next-80B-A3B-Instruct)
shell: bash -l {0}
141 changes: 141 additions & 0 deletions tests/e2e/multicard/long_sequence/test_basic.py
@@ -0,0 +1,141 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
"""Compare the short outputs of HF and vLLM when using greedy sampling.

Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
"""
Comment on lines +19 to +22
Contributor

high

The file's docstring contains an incorrect command to run the tests. It refers to test_qwen3_moe.py instead of the current file, test_basic.py. This is likely a copy-paste error and can be confusing for other developers.

Suggested change
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
"""
"""Compare the short outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/e2e/multicard/long_sequence/test_basic.py`.
"""


import os

import pytest
from vllm import SamplingParams

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import vllm_version_is

os.environ["HCCL_BUFFSIZE"] = "768"


@pytest.mark.skipif(vllm_version_is('0.12.0'),
                    reason="vLLM 0.12.0 does not support context parallelism.")
def test_pcp_dcp_basic():
    prompts = [
        "The capital of France is", "Hello, my name is Tom, I am",
        "The president of United States is", "AI future is"
    ]
    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
    with VllmRunner(model,
                    enforce_eager=True,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    prefill_context_parallel_size=2,
                    decode_context_parallel_size=2,
                    max_num_batched_tokens=1024,
                    enable_expert_parallel=True,
                    block_size=128) as runner:
        runner.model.generate(prompts, sampling_params)

    model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
    with VllmRunner(
            model,
            enforce_eager=True,
            max_model_len=1024,
            tensor_parallel_size=2,
            prefill_context_parallel_size=2,
            decode_context_parallel_size=1,
            enable_expert_parallel=True,
            block_size=128,
            quantization="ascend",
    ) as runner:
        runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is('0.12.0'),
                    reason="vLLM 0.12.0 does not support context parallelism.")
def test_pcp_dcp_full_graph():
    prompts = [
        "The capital of France is", "Hello, my name is Tom, I am",
        "The president of United States is", "AI future is"
    ]
    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
    with VllmRunner(model,
                    enforce_eager=False,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    prefill_context_parallel_size=2,
                    decode_context_parallel_size=2,
                    max_num_batched_tokens=1024,
                    enable_expert_parallel=True,
                    block_size=128,
                    compilation_config={
                        "cudagraph_mode": "FULL_DECODE_ONLY",
                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                    }) as runner:
        runner.model.generate(prompts, sampling_params)

    model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
    with VllmRunner(model,
                    enforce_eager=False,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    prefill_context_parallel_size=2,
                    decode_context_parallel_size=1,
                    enable_expert_parallel=True,
                    block_size=128,
                    quantization="ascend",
                    compilation_config={
                        "cudagraph_mode": "FULL_DECODE_ONLY",
                        "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
                    }) as runner:
        runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is('0.12.0'),
                    reason="vLLM 0.12.0 does not support context parallelism.")
def test_pcp_dcp_piece_wise():
    prompts = [
        "The capital of France is", "Hello, my name is Tom, I am",
        "The president of United States is", "AI future is"
    ]
    model = "deepseek-ai/DeepSeek-V2-Lite-Chat"
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
    with VllmRunner(model,
                    enforce_eager=False,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    prefill_context_parallel_size=2,
                    decode_context_parallel_size=2,
                    max_num_batched_tokens=1024,
                    enable_expert_parallel=True,
                    block_size=128) as runner:
        runner.model.generate(prompts, sampling_params)

    model = "vllm-ascend/Qwen3-30B-A3B-W8A8"
    with VllmRunner(model,
                    enforce_eager=False,
                    max_model_len=1024,
                    tensor_parallel_size=2,
                    prefill_context_parallel_size=2,
                    decode_context_parallel_size=1,
                    enable_expert_parallel=True,
                    block_size=128,
                    quantization="ascend") as runner:
        runner.model.generate(prompts, sampling_params)
Comment on lines +37 to +141
Contributor

high

The three test functions (test_pcp_dcp_basic, test_pcp_dcp_full_graph, test_pcp_dcp_piece_wise) are highly repetitive, making the code difficult to maintain. Key components like prompts, sampling_params, and the VllmRunner configuration are duplicated in each function.

A better approach is to refactor this using pytest.mark.parametrize. This creates a single parameterized test, eliminating the redundancy and making each configuration explicit and easier to manage. The suggested code implements this refactoring. Note that the `import pytest` in the suggestion is only there to keep it self-contained; the file already imports pytest at the top, so the duplicate import can be dropped.

import pytest


PROMPTS = [
    "The capital of France is",
    "Hello, my name is Tom, I am",
    "The president of United States is",
    "AI future is"
]
SAMPLING_PARAMS = SamplingParams(max_tokens=32, temperature=0.0)
DEEPSEEK_MODEL = "deepseek-ai/DeepSeek-V2-Lite-Chat"
QWEN_MODEL = "vllm-ascend/Qwen3-30B-A3B-W8A8"

BASE_DEEPSEEK_ARGS = {
    "max_model_len": 1024,
    "tensor_parallel_size": 2,
    "prefill_context_parallel_size": 2,
    "decode_context_parallel_size": 2,
    "max_num_batched_tokens": 1024,
    "enable_expert_parallel": True,
    "block_size": 128
}
BASE_QWEN_ARGS = {
    "max_model_len": 1024,
    "tensor_parallel_size": 2,
    "prefill_context_parallel_size": 2,
    "decode_context_parallel_size": 1,
    "enable_expert_parallel": True,
    "block_size": 128,
    "quantization": "ascend",
}

def _run_models(deepseek_vllm_runner_args, qwen_vllm_runner_args):
    with VllmRunner(DEEPSEEK_MODEL, **deepseek_vllm_runner_args) as runner:
        runner.model.generate(PROMPTS, SAMPLING_PARAMS)
    
    with VllmRunner(QWEN_MODEL, **qwen_vllm_runner_args) as runner:
        runner.model.generate(PROMPTS, SAMPLING_PARAMS)

@pytest.mark.parametrize("extra_args", [
    {"enforce_eager": True},
    {
        "enforce_eager": False,
        "compilation_config": {
            "cudagraph_mode": "FULL_DECODE_ONLY",
            "cudagraph_capture_sizes": [4, 8, 24, 48, 60]
        },
    },
    {"enforce_eager": False},
], ids=["basic", "full_graph", "piece_wise"])
def test_pcp_dcp(extra_args):
    deepseek_args = {**BASE_DEEPSEEK_ARGS, **extra_args}
    qwen_args = {**BASE_QWEN_ARGS, **extra_args}
    _run_models(deepseek_args, qwen_args)
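
Because the parametrize call declares ids, each variant gets its own pytest node id, so a single configuration can be run in isolation. A hypothetical invocation, assuming the file path added in this PR:

pytest -sv "tests/e2e/multicard/long_sequence/test_basic.py::test_pcp_dcp[full_graph]"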
