diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index ee2ddad6f63..22a655c5ff5 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -68,15 +68,6 @@ jobs: pip install -r requirements-dev.txt pip install -v -e . - - name: Run vllm-project/vllm-ascend test (non triton) - env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256 - if: ${{ inputs.type == 'full' }} - run: | - pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py - pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py - - name: Install Ascend toolkit & triton_ascend shell: bash -l {0} run: | @@ -94,6 +85,8 @@ jobs: run: | # pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_accuracy.py # pytest -sv --durations=0 tests/e2e/singlecard/test_quantization.py + pytest -sv --durations=0 tests/e2e/singlecard/test_aclgraph_mem.py + pytest -sv --durations=0 tests/e2e/singlecard/test_camem.py pytest -sv --durations=0 tests/e2e/singlecard/test_vlm.py::test_multimodal_vl pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py::test_qwen_pooling_classify_correctness diff --git a/vllm_ascend/ops/triton/activation/swiglu_quant.py b/vllm_ascend/ops/triton/activation/swiglu_quant.py index d857b3d474c..7ec2cbaf36a 100644 --- a/vllm_ascend/ops/triton/activation/swiglu_quant.py +++ b/vllm_ascend/ops/triton/activation/swiglu_quant.py @@ -1,8 +1,5 @@ import torch -from vllm.triton_utils import HAS_TRITON, tl, triton - -if HAS_TRITON: - import torch_npu._inductor # noqa: F401 +from vllm.triton_utils import tl, triton from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num diff --git a/vllm_ascend/ops/triton/fla/fused_qkvzba_split_reshape.py b/vllm_ascend/ops/triton/fla/fused_qkvzba_split_reshape.py index 4129fdd0f21..d809dcd46dc 100644 --- a/vllm_ascend/ops/triton/fla/fused_qkvzba_split_reshape.py +++ b/vllm_ascend/ops/triton/fla/fused_qkvzba_split_reshape.py @@ -10,10 +10,7 @@ # ruff: noqa: E501 # mypy: ignore-errors import torch -from vllm.triton_utils import HAS_TRITON, tl, triton - -if HAS_TRITON: - import torch_npu._inductor # noqa: F401 +from vllm.triton_utils import tl, triton @triton.jit diff --git a/vllm_ascend/ops/triton/rope.py b/vllm_ascend/ops/triton/rope.py index a3856ca3687..3700e329130 100644 --- a/vllm_ascend/ops/triton/rope.py +++ b/vllm_ascend/ops/triton/rope.py @@ -14,10 +14,7 @@ # limitations under the License. # This file is a part of the vllm-ascend project. # -from vllm.triton_utils import HAS_TRITON, tl, triton - -if HAS_TRITON: - import torch_npu._inductor # noqa: F401 +from vllm.triton_utils import tl, triton from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 303dae362e1..31053450a70 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -88,6 +88,11 @@ def __init__( # register patch for vllm from vllm_ascend.utils import adapt_patch adapt_patch() + # Import _inductor for graph mode execution with triton + # This lazy import avoids torch_npu re-initialization in patch + from vllm.triton_utils import HAS_TRITON + if HAS_TRITON: + import torch_npu._inductor # noqa: F401 # Register ops when worker init. from vllm_ascend import ops ops.register_dummy_fusion_op()