From 8482bbdac2352b7fa9a8594504faf2f4767d8163 Mon Sep 17 00:00:00 2001 From: hfadzxy Date: Wed, 7 Jan 2026 15:50:23 +0800 Subject: [PATCH] [Main2Main] Upgrade vllm commit to 0109 Signed-off-by: hfadzxy --- .github/workflows/_e2e_test.yaml | 4 +-- .github/workflows/bot_pr_create.yaml | 2 +- .github/workflows/pr_test_full.yaml | 2 +- .github/workflows/pr_test_light.yaml | 6 ++-- .../workflows/schedule_codecov_refresh.yaml | 2 +- docs/source/community/versioning_policy.md | 2 +- .../compile/test_norm_quant_fusion.py | 19 +++++----- tests/ut/attention/test_attention_cp.py | 5 +++ tests/ut/attention/test_attention_v1.py | 35 +++++++++++++++++++ tests/ut/attention/test_mla_v1.py | 17 +++++++++ tests/ut/attention/test_sfa_v1.py | 22 ++++++++++++ tests/ut/ops/test_activation.py | 19 ++++++++-- tests/ut/ops/test_layernorm.py | 14 ++++++-- tests/ut/ops/test_rotary_embedding.py | 18 +++++++++- tests/ut/ops/test_token_dispatcher.py | 17 +++++++++ tests/ut/ops/test_vocab_parallel_embedding.py | 9 +++++ tests/ut/worker/test_worker_v1.py | 19 ++++++---- vllm_ascend/attention/mla_v1.py | 8 +++-- vllm_ascend/ops/triton/mamba/causal_conv1d.py | 8 ++++- vllm_ascend/worker/model_runner_v1.py | 4 ++- vllm_ascend/worker/worker.py | 9 ++--- 21 files changed, 203 insertions(+), 38 deletions(-) diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 79c20073858..3a7acc1cc37 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -118,7 +118,7 @@ jobs: pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py # model_runner_v2 - pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py + # pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py # pooling pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py @@ -309,7 +309,7 @@ jobs: run: | pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_data_parallel_tp2.py pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_kimi_k2.py - pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py + pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py # long_sequence pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 7b8b1cf1610..f5775be4525 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=2f4e6548efec402b913ffddc8726230d9311948d + VLLM_COMMIT=bde38c11df0ea066a740efe9b77fff5418be45df echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV - name: Checkout repository diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index a96fe250790..9468346e9e6 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0] + vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 77c3ef46018..147967ebb95 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 2f4e6548efec402b913ffddc8726230d9311948d + vllm: bde38c11df0ea066a740efe9b77fff5418be45df changes: runs-on: linux-aarch64-a2-0 outputs: @@ -81,7 +81,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0] + vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -93,7 +93,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0] + vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index ae9000cb7b5..371e2ec9fa7 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d] + vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index d2530983943..1ecf381653f 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/ For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly. | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | +| main | bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 | ## Release cadence diff --git a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py index 1a335135ec9..057fe888e39 100644 --- a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py +++ b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py @@ -305,15 +305,16 @@ def test_rmsnorm_quant_fusion( vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype)) - update_environment_variables({ - "RANK": "0", - "LOCAL_RANK": "0", - "WORLD_SIZE": "1", - "MASTER_ADDR": "localhost", - "MASTER_PORT": "12345", - }) - init_distributed_environment() - ensure_model_parallel_initialized(1, 1) + with vllm.config.set_current_vllm_config(vllm_config): + update_environment_variables({ + "RANK": "0", + "LOCAL_RANK": "0", + "WORLD_SIZE": "1", + "MASTER_ADDR": "localhost", + "MASTER_PORT": "12345", + }) + init_distributed_environment() + ensure_model_parallel_initialized(1, 1) with vllm.config.set_current_vllm_config(vllm_config): with set_ascend_forward_context(None, vllm_config): diff --git a/tests/ut/attention/test_attention_cp.py b/tests/ut/attention/test_attention_cp.py index cc518fdab53..487d416978c 100644 --- a/tests/ut/attention/test_attention_cp.py +++ b/tests/ut/attention/test_attention_cp.py @@ -33,6 +33,11 @@ def setUp(self): self.layer_no_quant.layer_name = "test_layer" self.layer_no_quant._k_scale_float = 1.0 self.layer_no_quant._v_scale_float = 1.0 + self.mock_vllm_config = MagicMock() + self.config_patcher = patch( + 'vllm_ascend.attention.attention_v1.get_current_vllm_config', + return_value=self.mock_vllm_config) + self.config_patcher.start() self.impl = AscendAttentionCPImpl( num_heads=8, diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py index 0dbd5837f93..d57f4ef2a52 100644 --- a/tests/ut/attention/test_attention_v1.py +++ b/tests/ut/attention/test_attention_v1.py @@ -13,6 +13,23 @@ class TestAscendAttentionBackend(TestBase): + def setUp(self): + self.mock_config = MagicMock() + + mock_parallel_config = MagicMock() + mock_parallel_config.prefill_context_parallel_size = 1 + mock_parallel_config.decode_context_parallel_size = 1 + + self.mock_config.parallel_config = mock_parallel_config + + self.utils_patcher = patch( + 'vllm_ascend.attention.utils.get_current_vllm_config', + return_value=self.mock_config) + self.utils_patcher.start() + + from vllm_ascend.attention.utils import enable_cp + enable_cp.cache_clear() + def test_get_name(self): self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM") @@ -102,6 +119,19 @@ def test_build_non_310p(self, mock_soc_version, mock_ascend_metadata): class TestAscendAttentionBackendImpl(TestBase): def setUp(self): + self.mock_event = MagicMock() + self.mock_event.record.return_value = None + self.mock_event.wait.return_value = None + + self.mock_stream = MagicMock() + self.event_patcher = patch('torch_npu.npu.Event', + return_value=self.mock_event) + self.stream_patcher = patch('torch_npu.npu.current_stream', + return_value=self.mock_stream) + + self.event_patcher.start() + self.stream_patcher.start() + self.layer = MagicMock() self.layer.layer_name = "test_layer" self.layer._k_scale_float = 1.0 @@ -119,6 +149,11 @@ def setUp(self): self.layer_no_quant.layer_name = "test_layer" self.layer_no_quant._k_scale_float = 1.0 self.layer_no_quant._v_scale_float = 1.0 + self.mock_vllm_config = MagicMock() + self.config_patcher = patch( + 'vllm_ascend.attention.attention_v1.get_current_vllm_config', + return_value=self.mock_vllm_config) + self.config_patcher.start() self.impl = AscendAttentionBackendImpl( num_heads=8, diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py index 46a58626753..6d25fbba765 100755 --- a/tests/ut/attention/test_mla_v1.py +++ b/tests/ut/attention/test_mla_v1.py @@ -22,6 +22,23 @@ class TestAscendMLABackend(TestBase): + def setUp(self): + self.mock_config = MagicMock() + + mock_parallel_config = MagicMock() + mock_parallel_config.prefill_context_parallel_size = 1 + mock_parallel_config.decode_context_parallel_size = 1 + + self.mock_config.parallel_config = mock_parallel_config + + self.utils_patcher = patch( + 'vllm_ascend.attention.utils.get_current_vllm_config', + return_value=self.mock_config) + self.utils_patcher.start() + + from vllm_ascend.attention.utils import enable_cp + enable_cp.cache_clear() + def test_get_name(self): self.assertEqual(AscendMLABackend.get_name(), "ASCEND_MLA") diff --git a/tests/ut/attention/test_sfa_v1.py b/tests/ut/attention/test_sfa_v1.py index 43023b6bb4e..2fdddf12192 100644 --- a/tests/ut/attention/test_sfa_v1.py +++ b/tests/ut/attention/test_sfa_v1.py @@ -12,6 +12,7 @@ from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder) +from vllm_ascend.utils import enable_dsa_cp class TestAscendSFABackend(TestBase): @@ -83,6 +84,27 @@ def test_ascend_sfa_metadata_default(self): class TestAscendSFAMetadataBuilder(TestBase): + def setUp(self): + self.mock_cfg = MagicMock() + + self.mock_cfg.parallel_config = MagicMock() + self.mock_cfg.parallel_config.tensor_parallel_size = 1 + self.mock_cfg.parallel_config.prefill_context_parallel_size = 1 + self.mock_cfg.parallel_config.decode_context_parallel_size = 1 + + self.mock_cfg.compilation_config = MagicMock() + self.mock_cfg.compilation_config.pass_config = MagicMock() + self.mock_cfg.compilation_config.pass_config.enable_sp = False + + self.mock_cfg.speculative_config.num_speculative_tokens = 0 + + self.patcher = patch("vllm.config.get_current_vllm_config", + return_value=self.mock_cfg) + self.patcher.start() + + if hasattr(enable_dsa_cp, "cache_clear"): + enable_dsa_cp.cache_clear() + def test_ascend_sfa_metadata_builder_default(self): kv_cache_spec = MagicMock() layer_names = ["layer1", "layer2"] diff --git a/tests/ut/ops/test_activation.py b/tests/ut/ops/test_activation.py index 9b80236570a..bf03aa5c49e 100644 --- a/tests/ut/ops/test_activation.py +++ b/tests/ut/ops/test_activation.py @@ -13,10 +13,11 @@ # This file is a part of the vllm-ascend project. # -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest import torch +from vllm.config import set_current_vllm_config from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm_ascend.utils import AscendDeviceType @@ -27,8 +28,20 @@ def dummy_tensor(): return torch.randn(4, 8, dtype=torch.float16) +@pytest.fixture +def default_vllm_config(): + mock_config = MagicMock() + + mock_config.compilation_config.dispatch_forward_backend = "eager" + + mock_config.compilation_config.custom_ops = ["all"] + + with set_current_vllm_config(mock_config): + yield mock_config + + @patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1) -def test_QuickGELU_forward(mock_gelu, dummy_tensor): +def test_QuickGELU_forward(mock_gelu, dummy_tensor, default_vllm_config): layer = QuickGELU() out = layer.forward(dummy_tensor) @@ -45,7 +58,7 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor): side_effect=lambda x: None) def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj, mock_maybe_wait_prefetch_done, mock_swiglu, - is_310p, dummy_tensor): + is_310p, dummy_tensor, default_vllm_config): with patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._310P diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py index 03befc7e851..ce31f9785b3 100644 --- a/tests/ut/ops/test_layernorm.py +++ b/tests/ut/ops/test_layernorm.py @@ -1,7 +1,8 @@ -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest import torch +from vllm.config import set_current_vllm_config from vllm.model_executor.layers.layernorm import RMSNorm from vllm_ascend.utils import AscendDeviceType @@ -20,13 +21,22 @@ def mock_add_rms_norm(x, residual, weight, eps): return 2 * x, None, 2 * residual +@pytest.fixture(autouse=True) +def default_vllm_config(): + mock_config = MagicMock() + mock_config.compilation_config.custom_ops = ["all"] + + with set_current_vllm_config(mock_config): + yield mock_config + + @pytest.mark.parametrize("is_310p", [True, False]) @pytest.mark.parametrize("residual", [None, torch.randn(4, 8, dtype=torch.float32)]) @patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm) @patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm) def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p, residual, - dummy_tensor): + dummy_tensor, default_vllm_config): with patch("vllm_ascend.utils.get_ascend_device_type", return_value=AscendDeviceType._310P diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py index 567c15d9325..51568f88418 100644 --- a/tests/ut/ops/test_rotary_embedding.py +++ b/tests/ut/ops/test_rotary_embedding.py @@ -78,6 +78,12 @@ class TestAscendRotaryEmbedding(unittest.TestCase): def setUp(self): # Common setup for tests + self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config') + self.mock_get_config = self.config_patcher.start() + mock_config = MagicMock() + mock_config.compilation_config.custom_ops = ["all"] + + self.mock_get_config.return_value = mock_config self.positions = torch.tensor([1, 2, 3]) self.query = torch.randn(3, 1, 32, dtype=torch.float16) self.key = torch.randn(3, 1, 32, dtype=torch.float16) @@ -242,6 +248,12 @@ class TestAscendDeepseekScalingRotaryEmbedding(TestBase): def setUp(self): # Common setup for tests + self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config') + self.mock_get_config = self.config_patcher.start() + mock_config = MagicMock() + mock_config.compilation_config.custom_ops = ["all"] + + self.mock_get_config.return_value = mock_config self.positions = torch.tensor([1, 2, 3]) self.query = torch.randn(3, 1, 32, dtype=torch.float16) self.key = torch.randn(3, 1, 32, dtype=torch.float16) @@ -368,7 +380,11 @@ def test_yarn_get_mscale(self, mock_npuplatform): class TestAscendMRotaryEmbedding(unittest.TestCase): def setUp(self): - # Common setup for tests + self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config') + self.mock_get_config = self.config_patcher.start() + mock_config = MagicMock() + mock_config.compilation_config.custom_ops = ["all"] + self.mock_get_config.return_value = mock_config self.number_tokens = 3 self.num_head = 8 self.num_kvhead = 8 diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py index ff9e3cc1c1b..a1919b6b00d 100644 --- a/tests/ut/ops/test_token_dispatcher.py +++ b/tests/ut/ops/test_token_dispatcher.py @@ -29,6 +29,23 @@ class TestTokenDispatcherWithMC2(TestBase): def setUp(self): + self.config_patcher = patch( + 'vllm_ascend.ops.fused_moe.token_dispatcher.get_current_vllm_config' + ) + self.mock_get_config = self.config_patcher.start() + + mock_config = MagicMock() + + mock_config.scheduler_config.max_num_seqs = 256 + mock_config.scheduler_config.decode_max_num_seqs = 256 + + mock_config.compilation_config.custom_ops = ["all"] + + mock_config.speculative_config = None + + mock_config.parallel_config.tensor_parallel_size = 1 + + self.mock_get_config.return_value = mock_config self.mc2_group = MagicMock() self.mc2_group.device_group.return_value._get_backend.return_value.get_hccl_comm_name.return_value = "hccl_123" self.mc2_group.rank_in_group = 0 diff --git a/tests/ut/ops/test_vocab_parallel_embedding.py b/tests/ut/ops/test_vocab_parallel_embedding.py index 700da540f32..b09701be753 100644 --- a/tests/ut/ops/test_vocab_parallel_embedding.py +++ b/tests/ut/ops/test_vocab_parallel_embedding.py @@ -208,6 +208,15 @@ def test_output_shape(self): class TestAscendLogitsProcessor(unittest.TestCase): def setUp(self): + self.mock_vllm_config = MagicMock() + self.mock_vllm_config.compilation_config.custom_ops = ["all"] + + from vllm.config.vllm import set_current_vllm_config + set_current_vllm_config(self.mock_vllm_config) + + self.config_patch = patch("vllm.config.vllm.get_current_vllm_config", + return_value=self.mock_vllm_config) + self.config_patch.start() self.vocab_size = 50 self.num_embeddings = 50 self.embedding_dim = 10 diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py index 49d6c86eeb8..3b9fd9a8a9b 100644 --- a/tests/ut/worker/test_worker_v1.py +++ b/tests/ut/worker/test_worker_v1.py @@ -5,6 +5,7 @@ from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from tests.ut.base import TestBase +from vllm_ascend.utils import vllm_version_is init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules" @@ -52,7 +53,7 @@ def setUp(self): @patch("vllm_ascend.worker.worker.get_ascend_config") @patch("vllm_ascend.worker.worker.init_ascend_config") @patch("vllm_ascend.worker.worker.check_ascend_device_type") - @patch(init_cached_hf_modules_path) + @patch(init_cached_hf_modules_path, create=True) @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler") def test_init_npu_worker_normal_case( self, @@ -106,7 +107,7 @@ def test_init_npu_worker_normal_case( @patch("vllm_ascend.worker.worker.get_ascend_config") @patch("vllm_ascend.worker.worker.init_ascend_config") @patch("vllm_ascend.worker.worker.check_ascend_device_type") - @patch(init_cached_hf_modules_path) + @patch(init_cached_hf_modules_path, create=True) @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler") def test_init_npu_worker_with_trust_remote_code( self, @@ -140,7 +141,10 @@ def test_init_npu_worker_with_trust_remote_code( ) # Verify init_cached_hf_modules is called (trust_remote_code=True) - mock_init_cached_hf_modules.assert_called_once() + if vllm_version_is('0.13.0'): + mock_init_cached_hf_modules.assert_called_once() + else: + mock_init_cached_hf_modules.assert_not_called() @patch("vllm_ascend.utils.adapt_patch") @patch("vllm_ascend.ops") @@ -149,7 +153,7 @@ def test_init_npu_worker_with_trust_remote_code( @patch("vllm_ascend.worker.worker.get_ascend_config") @patch("vllm_ascend.worker.worker.init_ascend_config") @patch("vllm_ascend.worker.worker.check_ascend_device_type") - @patch(init_cached_hf_modules_path) + @patch(init_cached_hf_modules_path, create=True) @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler") def test_init_npu_worker_with_custom_cache_dtype( self, @@ -813,10 +817,11 @@ def test_execute_model_first_rank(self): mock_scheduler_output, None) self.assertEqual(result, mock_model_output) + @patch("vllm_ascend.worker.worker.enable_sp", return_value=False) @patch("vllm_ascend.worker.worker.get_pp_group") @patch("vllm_ascend.worker.worker.get_tp_group") def test_execute_model_middle_rank(self, mock_get_tp_group, - mock_get_pp_group): + mock_get_pp_group, mock_enable_sp): """Test execute_model method - middle rank case""" from vllm.sequence import IntermediateTensors @@ -1113,12 +1118,14 @@ def test_initialize_from_config_without_sleep_mode(self): worker.model_runner.initialize_kv_cache.assert_called_once_with( mock_kv_cache_config) + @patch("vllm_ascend.worker.worker.enable_sp", return_value=False) @patch("vllm_ascend.worker.worker.get_pp_group") @patch("vllm_ascend.worker.worker.get_tp_group") @patch("vllm_ascend.worker.worker.EMPTY_MODEL_RUNNER_OUTPUT") def test_execute_model_kv_connector_not_finished(self, mock_empty_output, mock_get_tp_group, - mock_get_pp_group): + mock_get_pp_group, + mock_enable_sp): """Test execute_model method - kv_connector_output not finished sending/recving case""" from vllm.sequence import IntermediateTensors diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py index 38cc7fd336a..975e1100aa4 100644 --- a/vllm_ascend/attention/mla_v1.py +++ b/vllm_ascend/attention/mla_v1.py @@ -6,7 +6,6 @@ import torch_npu import vllm.envs as envs_vllm from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl -from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.config import VllmConfig, get_current_vllm_config from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import logger @@ -39,12 +38,17 @@ from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, - weak_ref_tensors) + vllm_version_is, weak_ref_tensors) from vllm_ascend.worker.npu_input_batch import NPUInputBatch if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput +if vllm_version_is('0.13.0'): + from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore +else: + from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore + MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024 BUILD_METADATA_STEP_PREFILL = 0 BUILD_METADATA_STEP_DECODE = 1 diff --git a/vllm_ascend/ops/triton/mamba/causal_conv1d.py b/vllm_ascend/ops/triton/mamba/causal_conv1d.py index e24a5d8f1a1..29bae9c2125 100644 --- a/vllm_ascend/ops/triton/mamba/causal_conv1d.py +++ b/vllm_ascend/ops/triton/mamba/causal_conv1d.py @@ -13,7 +13,13 @@ import torch.nn.functional as F import triton import triton.language as tl -from vllm.attention.backends.utils import PAD_SLOT_ID + +from vllm_ascend.utils import vllm_version_is + +if vllm_version_is('0.13.0'): + from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore +else: + from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore def causal_conv1d_ref( diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index 80148785036..b1887e00e6d 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -1666,6 +1666,8 @@ def propose_draft_token_ids(sampled_token_ids): attn_metadata, aux_hidden_states, ) + if not vllm_version_is('0.13.0'): + self._copy_draft_token_ids_to_cpu(scheduler_output) ( logprobs_lists, @@ -1979,7 +1981,7 @@ def _build_dummy_attn_metadata( query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs + 1], _seq_lens_cpu=self.seq_lens.cpu[:num_reqs], - seq_lens=self.seq_lens.cpu[:num_reqs], + seq_lens=self.seq_lens.gpu[:num_reqs], num_reqs=num_reqs, num_actual_tokens=num_tokens, block_table_tensor=block_table_tensor[:num_reqs], diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 0094a0eb549..98ea9b4e1b1 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -132,11 +132,12 @@ def __init__( self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ self.cache_config.cache_dtype] - if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils.import_utils import init_cached_hf_modules + if vllm_version_is('0.13.0'): + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils.import_utils import init_cached_hf_modules - init_cached_hf_modules() + init_cached_hf_modules() self.profiler = self._init_profiler() if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode: