From 8482bbdac2352b7fa9a8594504faf2f4767d8163 Mon Sep 17 00:00:00 2001
From: hfadzxy <starmoon_zhang@163.com>
Date: Wed, 7 Jan 2026 15:50:23 +0800
Subject: [PATCH] [Main2Main] Upgrade vllm commit to 0109

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
---
 .github/workflows/_e2e_test.yaml              |  4 +--
 .github/workflows/bot_pr_create.yaml          |  2 +-
 .github/workflows/pr_test_full.yaml           |  2 +-
 .github/workflows/pr_test_light.yaml          |  6 ++--
 .../workflows/schedule_codecov_refresh.yaml   |  2 +-
 docs/source/community/versioning_policy.md    |  2 +-
 .../compile/test_norm_quant_fusion.py         | 19 +++++-----
 tests/ut/attention/test_attention_cp.py       |  5 +++
 tests/ut/attention/test_attention_v1.py       | 35 +++++++++++++++++++
 tests/ut/attention/test_mla_v1.py             | 17 +++++++++
 tests/ut/attention/test_sfa_v1.py             | 22 ++++++++++++
 tests/ut/ops/test_activation.py               | 19 ++++++++--
 tests/ut/ops/test_layernorm.py                | 14 ++++++--
 tests/ut/ops/test_rotary_embedding.py         | 18 +++++++++-
 tests/ut/ops/test_token_dispatcher.py         | 17 +++++++++
 tests/ut/ops/test_vocab_parallel_embedding.py |  9 +++++
 tests/ut/worker/test_worker_v1.py             | 19 ++++++----
 vllm_ascend/attention/mla_v1.py               |  8 +++--
 vllm_ascend/ops/triton/mamba/causal_conv1d.py |  8 ++++-
 vllm_ascend/worker/model_runner_v1.py         |  4 ++-
 vllm_ascend/worker/worker.py                  |  9 ++---
 21 files changed, 203 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml
index 79c20073858..3a7acc1cc37 100644
--- a/.github/workflows/_e2e_test.yaml
+++ b/.github/workflows/_e2e_test.yaml
@@ -118,7 +118,7 @@ jobs:
           pytest -sv --durations=0 tests/e2e/singlecard/compile/test_norm_quant_fusion.py
   
           # model_runner_v2
-          pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py
+          # pytest -sv --durations=0 tests/e2e/singlecard/model_runner_v2/test_basic.py
 
           # pooling
           pytest -sv --durations=0 tests/e2e/singlecard/pooling/test_classification.py
@@ -309,7 +309,7 @@ jobs:
         run: |
           pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_data_parallel_tp2.py
           pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_kimi_k2.py
-          pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py
+          pytest -sv --durations=0 tests/e2e/multicard/4-cards/test_qwen3_next.py 
 
           # long_sequence
           pytest -sv --durations=0 tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py
diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 7b8b1cf1610..f5775be4525 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=2f4e6548efec402b913ffddc8726230d9311948d
+          VLLM_COMMIT=bde38c11df0ea066a740efe9b77fff5418be45df
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index a96fe250790..9468346e9e6 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0]
+        vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 77c3ef46018..147967ebb95 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 2f4e6548efec402b913ffddc8726230d9311948d
+      vllm: bde38c11df0ea066a740efe9b77fff5418be45df
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -81,7 +81,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0]
+        vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -93,7 +93,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0]
+        vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index ae9000cb7b5..371e2ec9fa7 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [2f4e6548efec402b913ffddc8726230d9311948d]
+        vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index d2530983943..1ecf381653f 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -51,7 +51,7 @@ If you're using v0.7.3, don't forget to install [mindie-turbo](https://pypi.org/
 For main branch of vLLM Ascend, we usually make it compatible with the latest vLLM release and a newer commit hash of vLLM. Please note that this table is usually updated. Please check it regularly.
 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-|     main    | 2f4e6548efec402b913ffddc8726230d9311948d, v0.13.0 tag | >= 3.10, < 3.12   | 8.3.RC2 | 2.8.0 / 2.8.0 |
+|     main    | bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0 tag | >= 3.10, < 3.12   | 8.3.RC2 | 2.8.0 / 2.8.0 |
 
 ## Release cadence
 
diff --git a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py
index 1a335135ec9..057fe888e39 100644
--- a/tests/e2e/singlecard/compile/test_norm_quant_fusion.py
+++ b/tests/e2e/singlecard/compile/test_norm_quant_fusion.py
@@ -305,15 +305,16 @@ def test_rmsnorm_quant_fusion(
 
     vllm_config = VllmConfig(model_config=ModelConfig(dtype=dtype))
 
-    update_environment_variables({
-        "RANK": "0",
-        "LOCAL_RANK": "0",
-        "WORLD_SIZE": "1",
-        "MASTER_ADDR": "localhost",
-        "MASTER_PORT": "12345",
-    })
-    init_distributed_environment()
-    ensure_model_parallel_initialized(1, 1)
+    with vllm.config.set_current_vllm_config(vllm_config):
+        update_environment_variables({
+            "RANK": "0",
+            "LOCAL_RANK": "0",
+            "WORLD_SIZE": "1",
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12345",
+        })
+        init_distributed_environment()
+        ensure_model_parallel_initialized(1, 1)
 
     with vllm.config.set_current_vllm_config(vllm_config):
         with set_ascend_forward_context(None, vllm_config):
diff --git a/tests/ut/attention/test_attention_cp.py b/tests/ut/attention/test_attention_cp.py
index cc518fdab53..487d416978c 100644
--- a/tests/ut/attention/test_attention_cp.py
+++ b/tests/ut/attention/test_attention_cp.py
@@ -33,6 +33,11 @@ def setUp(self):
         self.layer_no_quant.layer_name = "test_layer"
         self.layer_no_quant._k_scale_float = 1.0
         self.layer_no_quant._v_scale_float = 1.0
+        self.mock_vllm_config = MagicMock()
+        self.config_patcher = patch(
+            'vllm_ascend.attention.attention_v1.get_current_vllm_config',
+            return_value=self.mock_vllm_config)
+        self.config_patcher.start()
 
         self.impl = AscendAttentionCPImpl(
             num_heads=8,
diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py
index 0dbd5837f93..d57f4ef2a52 100644
--- a/tests/ut/attention/test_attention_v1.py
+++ b/tests/ut/attention/test_attention_v1.py
@@ -13,6 +13,23 @@
 
 class TestAscendAttentionBackend(TestBase):
 
+    def setUp(self):
+        self.mock_config = MagicMock()
+
+        mock_parallel_config = MagicMock()
+        mock_parallel_config.prefill_context_parallel_size = 1
+        mock_parallel_config.decode_context_parallel_size = 1
+
+        self.mock_config.parallel_config = mock_parallel_config
+
+        self.utils_patcher = patch(
+            'vllm_ascend.attention.utils.get_current_vllm_config',
+            return_value=self.mock_config)
+        self.utils_patcher.start()
+
+        from vllm_ascend.attention.utils import enable_cp
+        enable_cp.cache_clear()
+
     def test_get_name(self):
         self.assertEqual(AscendAttentionBackend.get_name(), "CUSTOM")
 
@@ -102,6 +119,19 @@ def test_build_non_310p(self, mock_soc_version, mock_ascend_metadata):
 class TestAscendAttentionBackendImpl(TestBase):
 
     def setUp(self):
+        self.mock_event = MagicMock()
+        self.mock_event.record.return_value = None
+        self.mock_event.wait.return_value = None
+
+        self.mock_stream = MagicMock()
+        self.event_patcher = patch('torch_npu.npu.Event',
+                                   return_value=self.mock_event)
+        self.stream_patcher = patch('torch_npu.npu.current_stream',
+                                    return_value=self.mock_stream)
+
+        self.event_patcher.start()
+        self.stream_patcher.start()
+
         self.layer = MagicMock()
         self.layer.layer_name = "test_layer"
         self.layer._k_scale_float = 1.0
@@ -119,6 +149,11 @@ def setUp(self):
         self.layer_no_quant.layer_name = "test_layer"
         self.layer_no_quant._k_scale_float = 1.0
         self.layer_no_quant._v_scale_float = 1.0
+        self.mock_vllm_config = MagicMock()
+        self.config_patcher = patch(
+            'vllm_ascend.attention.attention_v1.get_current_vllm_config',
+            return_value=self.mock_vllm_config)
+        self.config_patcher.start()
 
         self.impl = AscendAttentionBackendImpl(
             num_heads=8,
diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py
index 46a58626753..6d25fbba765 100755
--- a/tests/ut/attention/test_mla_v1.py
+++ b/tests/ut/attention/test_mla_v1.py
@@ -22,6 +22,23 @@
 
 class TestAscendMLABackend(TestBase):
 
+    def setUp(self):
+        self.mock_config = MagicMock()
+
+        mock_parallel_config = MagicMock()
+        mock_parallel_config.prefill_context_parallel_size = 1
+        mock_parallel_config.decode_context_parallel_size = 1
+
+        self.mock_config.parallel_config = mock_parallel_config
+
+        self.utils_patcher = patch(
+            'vllm_ascend.attention.utils.get_current_vllm_config',
+            return_value=self.mock_config)
+        self.utils_patcher.start()
+
+        from vllm_ascend.attention.utils import enable_cp
+        enable_cp.cache_clear()
+
     def test_get_name(self):
         self.assertEqual(AscendMLABackend.get_name(), "ASCEND_MLA")
 
diff --git a/tests/ut/attention/test_sfa_v1.py b/tests/ut/attention/test_sfa_v1.py
index 43023b6bb4e..2fdddf12192 100644
--- a/tests/ut/attention/test_sfa_v1.py
+++ b/tests/ut/attention/test_sfa_v1.py
@@ -12,6 +12,7 @@
 from vllm_ascend.attention.sfa_v1 import (AscendSFABackend, AscendSFAImpl,
                                           AscendSFAMetadata,
                                           AscendSFAMetadataBuilder)
+from vllm_ascend.utils import enable_dsa_cp
 
 
 class TestAscendSFABackend(TestBase):
@@ -83,6 +84,27 @@ def test_ascend_sfa_metadata_default(self):
 
 class TestAscendSFAMetadataBuilder(TestBase):
 
+    def setUp(self):
+        self.mock_cfg = MagicMock()
+
+        self.mock_cfg.parallel_config = MagicMock()
+        self.mock_cfg.parallel_config.tensor_parallel_size = 1
+        self.mock_cfg.parallel_config.prefill_context_parallel_size = 1
+        self.mock_cfg.parallel_config.decode_context_parallel_size = 1
+
+        self.mock_cfg.compilation_config = MagicMock()
+        self.mock_cfg.compilation_config.pass_config = MagicMock()
+        self.mock_cfg.compilation_config.pass_config.enable_sp = False
+
+        self.mock_cfg.speculative_config.num_speculative_tokens = 0
+
+        self.patcher = patch("vllm.config.get_current_vllm_config",
+                             return_value=self.mock_cfg)
+        self.patcher.start()
+
+        if hasattr(enable_dsa_cp, "cache_clear"):
+            enable_dsa_cp.cache_clear()
+
     def test_ascend_sfa_metadata_builder_default(self):
         kv_cache_spec = MagicMock()
         layer_names = ["layer1", "layer2"]
diff --git a/tests/ut/ops/test_activation.py b/tests/ut/ops/test_activation.py
index 9b80236570a..bf03aa5c49e 100644
--- a/tests/ut/ops/test_activation.py
+++ b/tests/ut/ops/test_activation.py
@@ -13,10 +13,11 @@
 # This file is a part of the vllm-ascend project.
 #
 
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 import torch
+from vllm.config import set_current_vllm_config
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
 
 from vllm_ascend.utils import AscendDeviceType
@@ -27,8 +28,20 @@ def dummy_tensor():
     return torch.randn(4, 8, dtype=torch.float16)
 
 
+@pytest.fixture
+def default_vllm_config():
+    mock_config = MagicMock()
+
+    mock_config.compilation_config.dispatch_forward_backend = "eager"
+
+    mock_config.compilation_config.custom_ops = ["all"]
+
+    with set_current_vllm_config(mock_config):
+        yield mock_config
+
+
 @patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1)
-def test_QuickGELU_forward(mock_gelu, dummy_tensor):
+def test_QuickGELU_forward(mock_gelu, dummy_tensor, default_vllm_config):
     layer = QuickGELU()
     out = layer.forward(dummy_tensor)
 
@@ -45,7 +58,7 @@ def test_QuickGELU_forward(mock_gelu, dummy_tensor):
        side_effect=lambda x: None)
 def test_SiluAndMul_forward(mock_maybe_prefetch_mlp_down_proj,
                             mock_maybe_wait_prefetch_done, mock_swiglu,
-                            is_310p, dummy_tensor):
+                            is_310p, dummy_tensor, default_vllm_config):
 
     with patch("vllm_ascend.utils.get_ascend_device_type",
                return_value=AscendDeviceType._310P
diff --git a/tests/ut/ops/test_layernorm.py b/tests/ut/ops/test_layernorm.py
index 03befc7e851..ce31f9785b3 100644
--- a/tests/ut/ops/test_layernorm.py
+++ b/tests/ut/ops/test_layernorm.py
@@ -1,7 +1,8 @@
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 
 import pytest
 import torch
+from vllm.config import set_current_vllm_config
 from vllm.model_executor.layers.layernorm import RMSNorm
 
 from vllm_ascend.utils import AscendDeviceType
@@ -20,13 +21,22 @@ def mock_add_rms_norm(x, residual, weight, eps):
     return 2 * x, None, 2 * residual
 
 
+@pytest.fixture(autouse=True)
+def default_vllm_config():
+    mock_config = MagicMock()
+    mock_config.compilation_config.custom_ops = ["all"]
+
+    with set_current_vllm_config(mock_config):
+        yield mock_config
+
+
 @pytest.mark.parametrize("is_310p", [True, False])
 @pytest.mark.parametrize("residual",
                          [None, torch.randn(4, 8, dtype=torch.float32)])
 @patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
 @patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm)
 def test_RMSNorm_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p, residual,
-                         dummy_tensor):
+                         dummy_tensor, default_vllm_config):
 
     with patch("vllm_ascend.utils.get_ascend_device_type",
                return_value=AscendDeviceType._310P
diff --git a/tests/ut/ops/test_rotary_embedding.py b/tests/ut/ops/test_rotary_embedding.py
index 567c15d9325..51568f88418 100644
--- a/tests/ut/ops/test_rotary_embedding.py
+++ b/tests/ut/ops/test_rotary_embedding.py
@@ -78,6 +78,12 @@ class TestAscendRotaryEmbedding(unittest.TestCase):
 
     def setUp(self):
         # Common setup for tests
+        self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
+        self.mock_get_config = self.config_patcher.start()
+        mock_config = MagicMock()
+        mock_config.compilation_config.custom_ops = ["all"]
+
+        self.mock_get_config.return_value = mock_config
         self.positions = torch.tensor([1, 2, 3])
         self.query = torch.randn(3, 1, 32, dtype=torch.float16)
         self.key = torch.randn(3, 1, 32, dtype=torch.float16)
@@ -242,6 +248,12 @@ class TestAscendDeepseekScalingRotaryEmbedding(TestBase):
 
     def setUp(self):
         # Common setup for tests
+        self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
+        self.mock_get_config = self.config_patcher.start()
+        mock_config = MagicMock()
+        mock_config.compilation_config.custom_ops = ["all"]
+
+        self.mock_get_config.return_value = mock_config
         self.positions = torch.tensor([1, 2, 3])
         self.query = torch.randn(3, 1, 32, dtype=torch.float16)
         self.key = torch.randn(3, 1, 32, dtype=torch.float16)
@@ -368,7 +380,11 @@ def test_yarn_get_mscale(self, mock_npuplatform):
 class TestAscendMRotaryEmbedding(unittest.TestCase):
 
     def setUp(self):
-        # Common setup for tests
+        self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
+        self.mock_get_config = self.config_patcher.start()
+        mock_config = MagicMock()
+        mock_config.compilation_config.custom_ops = ["all"]
+        self.mock_get_config.return_value = mock_config
         self.number_tokens = 3
         self.num_head = 8
         self.num_kvhead = 8
diff --git a/tests/ut/ops/test_token_dispatcher.py b/tests/ut/ops/test_token_dispatcher.py
index ff9e3cc1c1b..a1919b6b00d 100644
--- a/tests/ut/ops/test_token_dispatcher.py
+++ b/tests/ut/ops/test_token_dispatcher.py
@@ -29,6 +29,23 @@
 class TestTokenDispatcherWithMC2(TestBase):
 
     def setUp(self):
+        self.config_patcher = patch(
+            'vllm_ascend.ops.fused_moe.token_dispatcher.get_current_vllm_config'
+        )
+        self.mock_get_config = self.config_patcher.start()
+
+        mock_config = MagicMock()
+
+        mock_config.scheduler_config.max_num_seqs = 256
+        mock_config.scheduler_config.decode_max_num_seqs = 256
+
+        mock_config.compilation_config.custom_ops = ["all"]
+
+        mock_config.speculative_config = None
+
+        mock_config.parallel_config.tensor_parallel_size = 1
+
+        self.mock_get_config.return_value = mock_config
         self.mc2_group = MagicMock()
         self.mc2_group.device_group.return_value._get_backend.return_value.get_hccl_comm_name.return_value = "hccl_123"
         self.mc2_group.rank_in_group = 0
diff --git a/tests/ut/ops/test_vocab_parallel_embedding.py b/tests/ut/ops/test_vocab_parallel_embedding.py
index 700da540f32..b09701be753 100644
--- a/tests/ut/ops/test_vocab_parallel_embedding.py
+++ b/tests/ut/ops/test_vocab_parallel_embedding.py
@@ -208,6 +208,15 @@ def test_output_shape(self):
 class TestAscendLogitsProcessor(unittest.TestCase):
 
     def setUp(self):
+        self.mock_vllm_config = MagicMock()
+        self.mock_vllm_config.compilation_config.custom_ops = ["all"]
+
+        from vllm.config.vllm import set_current_vllm_config
+        set_current_vllm_config(self.mock_vllm_config)
+
+        self.config_patch = patch("vllm.config.vllm.get_current_vllm_config",
+                                  return_value=self.mock_vllm_config)
+        self.config_patch.start()
         self.vocab_size = 50
         self.num_embeddings = 50
         self.embedding_dim = 10
diff --git a/tests/ut/worker/test_worker_v1.py b/tests/ut/worker/test_worker_v1.py
index 49d6c86eeb8..3b9fd9a8a9b 100644
--- a/tests/ut/worker/test_worker_v1.py
+++ b/tests/ut/worker/test_worker_v1.py
@@ -5,6 +5,7 @@
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig
 
 from tests.ut.base import TestBase
+from vllm_ascend.utils import vllm_version_is
 
 init_cached_hf_modules_path = "vllm.utils.import_utils.init_cached_hf_modules"
 
@@ -52,7 +53,7 @@ def setUp(self):
     @patch("vllm_ascend.worker.worker.get_ascend_config")
     @patch("vllm_ascend.worker.worker.init_ascend_config")
     @patch("vllm_ascend.worker.worker.check_ascend_device_type")
-    @patch(init_cached_hf_modules_path)
+    @patch(init_cached_hf_modules_path, create=True)
     @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler")
     def test_init_npu_worker_normal_case(
         self,
@@ -106,7 +107,7 @@ def test_init_npu_worker_normal_case(
     @patch("vllm_ascend.worker.worker.get_ascend_config")
     @patch("vllm_ascend.worker.worker.init_ascend_config")
     @patch("vllm_ascend.worker.worker.check_ascend_device_type")
-    @patch(init_cached_hf_modules_path)
+    @patch(init_cached_hf_modules_path, create=True)
     @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler")
     def test_init_npu_worker_with_trust_remote_code(
         self,
@@ -140,7 +141,10 @@ def test_init_npu_worker_with_trust_remote_code(
         )
 
         # Verify init_cached_hf_modules is called (trust_remote_code=True)
-        mock_init_cached_hf_modules.assert_called_once()
+        if vllm_version_is('0.13.0'):
+            mock_init_cached_hf_modules.assert_called_once()
+        else:
+            mock_init_cached_hf_modules.assert_not_called()
 
     @patch("vllm_ascend.utils.adapt_patch")
     @patch("vllm_ascend.ops")
@@ -149,7 +153,7 @@ def test_init_npu_worker_with_trust_remote_code(
     @patch("vllm_ascend.worker.worker.get_ascend_config")
     @patch("vllm_ascend.worker.worker.init_ascend_config")
     @patch("vllm_ascend.worker.worker.check_ascend_device_type")
-    @patch(init_cached_hf_modules_path)
+    @patch(init_cached_hf_modules_path, create=True)
     @patch("vllm_ascend.worker.worker.NPUWorker._init_profiler")
     def test_init_npu_worker_with_custom_cache_dtype(
         self,
@@ -813,10 +817,11 @@ def test_execute_model_first_rank(self):
                 mock_scheduler_output, None)
             self.assertEqual(result, mock_model_output)
 
+    @patch("vllm_ascend.worker.worker.enable_sp", return_value=False)
     @patch("vllm_ascend.worker.worker.get_pp_group")
     @patch("vllm_ascend.worker.worker.get_tp_group")
     def test_execute_model_middle_rank(self, mock_get_tp_group,
-                                       mock_get_pp_group):
+                                       mock_get_pp_group, mock_enable_sp):
         """Test execute_model method - middle rank case"""
         from vllm.sequence import IntermediateTensors
 
@@ -1113,12 +1118,14 @@ def test_initialize_from_config_without_sleep_mode(self):
             worker.model_runner.initialize_kv_cache.assert_called_once_with(
                 mock_kv_cache_config)
 
+    @patch("vllm_ascend.worker.worker.enable_sp", return_value=False)
     @patch("vllm_ascend.worker.worker.get_pp_group")
     @patch("vllm_ascend.worker.worker.get_tp_group")
     @patch("vllm_ascend.worker.worker.EMPTY_MODEL_RUNNER_OUTPUT")
     def test_execute_model_kv_connector_not_finished(self, mock_empty_output,
                                                      mock_get_tp_group,
-                                                     mock_get_pp_group):
+                                                     mock_get_pp_group,
+                                                     mock_enable_sp):
         """Test execute_model method - kv_connector_output not finished sending/recving case"""
         from vllm.sequence import IntermediateTensors
 
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index 38cc7fd336a..975e1100aa4 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -6,7 +6,6 @@
 import torch_npu
 import vllm.envs as envs_vllm
 from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
-from vllm.attention.backends.utils import PAD_SLOT_ID
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import logger
@@ -39,12 +38,17 @@
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
 from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, maybe_trans_nz,
-                               weak_ref_tensors)
+                               vllm_version_is, weak_ref_tensors)
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
 
+if vllm_version_is('0.13.0'):
+    from vllm.attention.backends.utils import PAD_SLOT_ID  # type: ignore
+else:
+    from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore
+
 MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
 BUILD_METADATA_STEP_PREFILL = 0
 BUILD_METADATA_STEP_DECODE = 1
diff --git a/vllm_ascend/ops/triton/mamba/causal_conv1d.py b/vllm_ascend/ops/triton/mamba/causal_conv1d.py
index e24a5d8f1a1..29bae9c2125 100644
--- a/vllm_ascend/ops/triton/mamba/causal_conv1d.py
+++ b/vllm_ascend/ops/triton/mamba/causal_conv1d.py
@@ -13,7 +13,13 @@
 import torch.nn.functional as F
 import triton
 import triton.language as tl
-from vllm.attention.backends.utils import PAD_SLOT_ID
+
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is('0.13.0'):
+    from vllm.attention.backends.utils import PAD_SLOT_ID  # type: ignore
+else:
+    from vllm.v1.attention.backends.utils import PAD_SLOT_ID  # type: ignore
 
 
 def causal_conv1d_ref(
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 80148785036..b1887e00e6d 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1666,6 +1666,8 @@ def propose_draft_token_ids(sampled_token_ids):
                 attn_metadata,
                 aux_hidden_states,
             )
+            if not vllm_version_is('0.13.0'):
+                self._copy_draft_token_ids_to_cpu(scheduler_output)
 
         (
             logprobs_lists,
@@ -1979,7 +1981,7 @@ def _build_dummy_attn_metadata(
                     query_start_loc_cpu=self.query_start_loc.cpu[:num_reqs +
                                                                  1],
                     _seq_lens_cpu=self.seq_lens.cpu[:num_reqs],
-                    seq_lens=self.seq_lens.cpu[:num_reqs],
+                    seq_lens=self.seq_lens.gpu[:num_reqs],
                     num_reqs=num_reqs,
                     num_actual_tokens=num_tokens,
                     block_table_tensor=block_table_tensor[:num_reqs],
diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py
index 0094a0eb549..98ea9b4e1b1 100644
--- a/vllm_ascend/worker/worker.py
+++ b/vllm_ascend/worker/worker.py
@@ -132,11 +132,12 @@ def __init__(
             self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
                 self.cache_config.cache_dtype]
 
-        if self.model_config.trust_remote_code:
-            # note: lazy import to avoid importing torch before initializing
-            from vllm.utils.import_utils import init_cached_hf_modules
+        if vllm_version_is('0.13.0'):
+            if self.model_config.trust_remote_code:
+                # note: lazy import to avoid importing torch before initializing
+                from vllm.utils.import_utils import init_cached_hf_modules
 
-            init_cached_hf_modules()
+                init_cached_hf_modules()
 
         self.profiler = self._init_profiler()
         if vllm_config.model_config and vllm_config.model_config.enable_sleep_mode: