diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index 4bf3fc082ef..2030b570477 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
+          VLLM_COMMIT=5b3ba94ab4bd9da739bcc27cdd05505467fa499e
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
 
       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 64068c64939..7f3ae9477fe 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=4034c3d32e30d01639459edd3ab486f56993876d
+ARG VLLM_COMMIT=5b3ba94ab4bd9da739bcc27cdd05505467fa499e
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index dfa0b74bc3d..c5bc6d5c138 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 76664e0fbb2..be0b76d5803 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 4034c3d32e30d01639459edd3ab486f56993876d
+      vllm: 5b3ba94ab4bd9da739bcc27cdd05505467fa499e
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -90,7 +90,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -102,7 +102,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0]
+        vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index dd4f2c8484b..a149e744633 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [4034c3d32e30d01639459edd3ab486f56993876d]
+        vllm_version: [5b3ba94ab4bd9da739bcc27cdd05505467fa499e]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 65bcad440b5..0b003d474af 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -59,7 +59,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM         | Python           | Stable CANN | PyTorch/torch_npu  |
 |-------------|--------------|------------------|-------------|--------------------|
-|     main    | 4034c3d32e30d01639459edd3ab486f56993876d, v0.17.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
+|     main    | 5b3ba94ab4bd9da739bcc27cdd05505467fa499e, v0.17.0 tag | >= 3.10, < 3.12   | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
 
diff --git a/vllm_ascend/compilation/compiler_interface.py b/vllm_ascend/compilation/compiler_interface.py
index 22b6f8a1952..ff7b31a4fc6 100644
--- a/vllm_ascend/compilation/compiler_interface.py
+++ b/vllm_ascend/compilation/compiler_interface.py
@@ -17,6 +17,7 @@
 #
 import copy
 import functools
+import logging
 from collections.abc import Callable
 from typing import Any
 
@@ -33,6 +34,40 @@
 from vllm_ascend.ascend_config import AscendCompilationConfig, get_ascend_config
 from vllm_ascend.utils import COMPILATION_PASS_KEY
 
+logger = logging.getLogger(__name__)
+
+
+def convert_fake_inputs_to_current_fake_mode(example_inputs: list[Any]) -> list[Any]:
+    """Re-home fake-tensor inputs onto the FakeTensorMode that is currently active.
+
+    Works around a FakeTensorMode mismatch seen after the vllm upgrade: the
+    piecewise backend now compiles ranges upfront in __init__, so example
+    inputs may be fake tensors taken from graph placeholder nodes whose fake
+    mode is not the one of the current tracing context.
+
+    Returns ``example_inputs`` itself (after logging a warning) when no fake
+    mode is detected; otherwise returns a new list in which every tensor
+    carrying a different ``fake_mode`` has been passed through
+    ``from_tensor`` of the detected mode and everything else is kept as is.
+    """
+    from torch._guards import detect_fake_mode
+
+    active_mode = detect_fake_mode()
+    if active_mode is None:
+        logger.warning("detect_fake_mode() returned None. FakeTensorMode mismatch fix may not be applied.")
+        return example_inputs
+
+    result: list[Any] = []
+    for item in example_inputs:
+        # Only tensors that expose a fake_mode other than the active one are converted.
+        foreign = isinstance(item, torch.Tensor) and hasattr(item, "fake_mode") and item.fake_mode is not active_mode
+        if foreign:
+            previous_mode = item.fake_mode
+            item = active_mode.from_tensor(item)
+            logger.debug("Converting fake tensor from fake_mode %s to %s", previous_mode, active_mode)
+        result.append(item)
+    return result
+
 
 def compile_fx(graph: GraphModule, example_inputs: list, inner_compile: Callable, decompositions: dict) -> Callable:
     recursive_compile_fx = functools.partial(compile_fx, inner_compile=inner_compile, decompositions=decompositions)
@@ -49,6 +84,9 @@ def fusion_pass_compile(
     compile_range: Range,
     key: str | None = None,
 ) -> tuple[Callable | None, Any | None]:
+    # Fix for FakeTensorMode mismatch issue in vllm upgrade
+    example_inputs = convert_fake_inputs_to_current_fake_mode(example_inputs)
+
     def compile_inner(graph, example_inputs):
         current_pass_manager = compiler_config[COMPILATION_PASS_KEY]
         graph = current_pass_manager(graph)
@@ -101,6 +139,15 @@ def npugraph_ex_compile(
 
     npugraph_ex = torchair.get_npu_backend(compiler_config=config)
 
+    # Apply graph fusion passes (including GELU replacement) before torchair compilation
+    # This is needed to replace unsupported operations like aten::gelu with NPU-compatible versions
+    if COMPILATION_PASS_KEY in compiler_config:
+        current_pass_manager = compiler_config[COMPILATION_PASS_KEY]
+        graph = current_pass_manager(graph)
+
+    # Fix for FakeTensorMode mismatch issue in vllm upgrade
+    example_inputs = convert_fake_inputs_to_current_fake_mode(example_inputs)
+
     # torch.compile requires the output of the fx graph to be a tuple
     if not graph_returns_tuple(graph):
         return make_graph_return_tuple(graph, example_inputs, npugraph_ex), None
diff --git a/vllm_ascend/compilation/graph_fusion_pass_manager.py b/vllm_ascend/compilation/graph_fusion_pass_manager.py
index 40acb081219..bb500d78396 100644
--- a/vllm_ascend/compilation/graph_fusion_pass_manager.py
+++ b/vllm_ascend/compilation/graph_fusion_pass_manager.py
@@ -73,3 +73,11 @@ def configure(self, config: VllmConfig):
             from .passes.sequence_parallelism import AscendSequenceParallelismPass
 
             self.passes.append(AscendSequenceParallelismPass(config))
+
+        # GELU replacement pass - needed for models like Whisper that use GELU
+        # which is not natively supported on NPU. Uses torch_npu.npu_gelu
+        # which provides exact GELU computation on NPU devices.
+        if self.ascend_compilation_config.get("replace_gelu", True):
+            from .passes.gelu_replacement_pass import GeluReplacementPass
+
+            self.passes.append(GeluReplacementPass(config))
diff --git a/vllm_ascend/compilation/passes/gelu_replacement_pass.py b/vllm_ascend/compilation/passes/gelu_replacement_pass.py
new file mode 100644
index 00000000000..40ea6503721
--- /dev/null
+++ b/vllm_ascend/compilation/passes/gelu_replacement_pass.py
@@ -0,0 +1,210 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import annotations
+
+import torch
+from torch._inductor.pattern_matcher import PatternMatcherPass
+from vllm.config import VllmConfig
+from vllm.config.compilation import Range
+from vllm.logger import logger
+
+from vllm_ascend.compilation.passes.base_pattern import BasePattern
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.15.0"):
+    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
+else:
+    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
+
+
+class GeluPattern(BasePattern):
+    """Rewrite rule: ``aten.gelu.default`` -> ``torch_npu.npu_gelu``.
+
+    Both the search and the substitute function take a single tensor ``x``.
+
+    Rationale carried over from the original author: aten::gelu is not
+    supported on NPU and falls back to CPU, and that fallback causes errors
+    during graph capture because of host-device synchronization restrictions.
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        # Nothing pattern-specific to set up; defer entirely to BasePattern.
+        super().__init__(vllm_config)
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """Return the single example tensor used for this pattern.
+
+        The tensor is random, of shape (2, 2048), allocated on the "npu"
+        device with dtype ``self.dtype``.
+        """
+        return [torch.randn(2, 2048, device="npu", dtype=self.dtype)]
+
+    def get_pattern(self):
+        """Return the callable describing the subgraph to search for."""
+
+        def pattern(x: torch.Tensor):
+            # ``approximate`` is deliberately not passed, so this corresponds
+            # to the default case where approximate="none" is used implicitly.
+            matched = torch.ops.aten.gelu.default(x)
+            return matched
+
+        return pattern
+
+    def get_replacement(self):
+        """Return the callable describing the substituted subgraph."""
+
+        def replacement(x: torch.Tensor):
+            # Function-local import: torch_npu is resolved when this body runs.
+            import torch_npu
+
+            activated = torch_npu.npu_gelu(x)
+            return activated
+
+        return replacement
+
+
+class GeluInplacePattern(BasePattern):
+    """Rewrite rule for the in-place op ``aten.gelu_.default``.
+
+    The substitute computes ``torch_npu.npu_gelu`` and then copies the result
+    back into the input tensor, which is what gets returned.
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        # Nothing pattern-specific to set up; defer entirely to BasePattern.
+        super().__init__(vllm_config)
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """Return the single example tensor used for this pattern.
+
+        The tensor is random, of shape (2, 2048), allocated on the "npu"
+        device with dtype ``self.dtype``.
+        """
+        return [torch.randn(2, 2048, device="npu", dtype=self.dtype)]
+
+    def get_pattern(self):
+        """Return the callable describing the subgraph to search for."""
+
+        def pattern(x: torch.Tensor):
+            mutated = torch.ops.aten.gelu_.default(x)
+            return mutated
+
+        return pattern
+
+    def get_replacement(self):
+        """Return the callable describing the substituted subgraph."""
+
+        def replacement(x: torch.Tensor):
+            import torch_npu
+
+            # npu_gelu is not in-place, so write its result back into ``x``.
+            x.copy_(torch_npu.npu_gelu(x))
+            return x
+
+        return replacement
+
+
+class GeluOutPattern(BasePattern):
+    """Rewrite rule for ``aten.gelu.out`` (GELU writing into a given tensor).
+
+    Per the original author, this variant is generated by torch.compile when
+    graphs are compiled upfront (e.g. after vllm commit 5569f5218, which
+    stops lazy compilation). Its schema is:
+        aten::gelu.out(Tensor self, *, str approximate="none", Tensor(a!) out)
+        -> Tensor(a!)
+
+    The substitute computes ``torch_npu.npu_gelu`` on the input, copies the
+    result into ``out`` and returns ``out``.
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        # Nothing pattern-specific to set up; defer entirely to BasePattern.
+        super().__init__(vllm_config)
+
+    def get_inputs(self) -> list[torch.Tensor]:
+        """Return example tensors in call order: the input, then ``out``.
+
+        The input is random, of shape (2, 2048), on the "npu" device with
+        dtype ``self.dtype``; ``out`` is an uninitialized tensor like it.
+        """
+        sample = torch.randn(2, 2048, device="npu", dtype=self.dtype)
+        return [sample, torch.empty_like(sample)]
+
+    def get_pattern(self):
+        """Return the callable describing the subgraph to search for."""
+
+        def pattern(x: torch.Tensor, out: torch.Tensor):
+            # ``out`` is keyword-only in the aten::gelu.out schema, so it is
+            # passed by name here.
+            written = torch.ops.aten.gelu.out(x, out=out)
+            return written
+
+        return pattern
+
+    def get_replacement(self):
+        """Return the callable describing the substituted subgraph."""
+
+        def replacement(x: torch.Tensor, out: torch.Tensor):
+            # Function-local import: torch_npu is resolved when this body runs.
+            import torch_npu
+
+            # Compute with npu_gelu, then fill the tensor supplied as ``out``.
+            out.copy_(torch_npu.npu_gelu(x))
+            return out
+
+        return replacement
+
+
+class GeluReplacementPass(VllmInductorPass):
+    """Pass that swaps aten GELU ops for ``torch_npu.npu_gelu`` equivalents.
+
+    Three patterns are registered, in this order: ``GeluPattern``
+    (gelu.default), ``GeluInplacePattern`` (gelu_.default) and
+    ``GeluOutPattern`` (gelu.out). Registration is skipped when the model
+    dtype is not float16, bfloat16 or float32.
+    """
+
+    def __init__(self, vllm_config: VllmConfig):
+        super().__init__(vllm_config)
+        self.pattern_match_passes: PatternMatcherPass = PatternMatcherPass(pass_name="gelu_replacement_pass")
+
+        model_dtype = vllm_config.model_config.dtype
+        if model_dtype in (torch.float16, torch.bfloat16, torch.float32):
+            # Registration order: default, in-place, then the out variant.
+            for pattern_cls in (GeluPattern, GeluInplacePattern, GeluOutPattern):
+                pattern_cls(vllm_config).register(self.pattern_match_passes)
+        else:
+            # No patterns are registered in this case.
+            logger.debug("GELU replacement not enabled: unsupported dtype %s", model_dtype)
+
+    def __call__(self, graph: torch.fx.Graph) -> None:  # type: ignore[override]
+        """Apply the registered patterns to ``graph``.
+
+        The number returned by the matcher is stored in ``self.matched_count``.
+        """
+        self.begin()
+        num_replaced = self.pattern_match_passes.apply(graph)
+        self.matched_count = num_replaced
+        if num_replaced > 0:
+            logger.debug("Replaced %s gelu operations with NPU-compatible version", num_replaced)
+        self.end_and_log()
+
+    def is_applicable_for_range(self, compile_range: Range) -> bool:
+        """Report this pass as applicable to every compile range."""
+        return True
diff --git a/vllm_ascend/patch/platform/__init__.py b/vllm_ascend/patch/platform/__init__.py
index 4b8fc9d256f..f355c09332c 100644
--- a/vllm_ascend/patch/platform/__init__.py
+++ b/vllm_ascend/patch/platform/__init__.py
@@ -21,6 +21,7 @@
 import vllm_ascend.patch.platform.patch_mamba_config  # noqa
 import vllm_ascend.patch.platform.patch_minimax_m2_config  # noqa
 import vllm_ascend.patch.platform.patch_sched_yield  # noqa
+import vllm_ascend.patch.platform.patch_torch_accelerator  # noqa
 
 if os.getenv("DYNAMIC_EPLB", "false").lower() in ("true", "1") or os.getenv("EXPERT_MAP_RECORD", "false") == "true":
     import vllm_ascend.patch.platform.patch_multiproc_executor  # noqa
diff --git a/vllm_ascend/patch/platform/patch_multiproc_executor.py b/vllm_ascend/patch/platform/patch_multiproc_executor.py
index 50f74e60e8d..524e8db401e 100644
--- a/vllm_ascend/patch/platform/patch_multiproc_executor.py
+++ b/vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -1,4 +1,5 @@
-import threading
+from __future__ import annotations
+
 import weakref
 from collections import deque
 from collections.abc import Callable
@@ -26,7 +27,6 @@ def _init_executor(self) -> None:
         # and ensure workers will be terminated.
         self._finalizer = weakref.finalize(self, self.shutdown)
         self.is_failed = False
-        self.shutdown_event = threading.Event()
         self.failure_callback: FailureCallback | None = None
 
         tensor_parallel_size, pp_parallel_size, pcp_parallel_size = self._get_parallel_sizes()
@@ -66,20 +66,28 @@ def _init_executor(self) -> None:
         success = False
         try:
             global_start_rank = self.local_world_size * self.parallel_config.node_rank_within_dp
+            # When using fork, keep track of socket file descriptors that are
+            # inherited by the worker, so that we can close them in subsequent
+            # workers
+            inherited_fds: list[int] | None = [] if context.get_start_method() == "fork" else None
+
             for local_rank in range(self.local_world_size):
                 global_rank = global_start_rank + local_rank
                 is_driver_worker = self._is_driver_worker(global_rank)
-                unready_workers.append(
-                    AscendWorkerProc.make_worker_process(
-                        vllm_config=self.vllm_config,
-                        local_rank=local_rank,
-                        rank=global_rank,
-                        distributed_init_method=distributed_init_method,
-                        input_shm_handle=scheduler_output_handle,
-                        shared_worker_lock=shared_worker_lock,
-                        is_driver_worker=is_driver_worker,
-                    )
+                unready_worker_handle = AscendWorkerProc.make_worker_process(
+                    vllm_config=self.vllm_config,
+                    local_rank=local_rank,
+                    rank=global_rank,
+                    distributed_init_method=distributed_init_method,
+                    input_shm_handle=scheduler_output_handle,
+                    shared_worker_lock=shared_worker_lock,
+                    is_driver_worker=is_driver_worker,
+                    inherited_fds=inherited_fds,
                 )
+                unready_workers.append(unready_worker_handle)
+                if inherited_fds is not None:
+                    inherited_fds.append(unready_worker_handle.death_writer.fileno())
+                    inherited_fds.append(unready_worker_handle.ready_pipe.fileno())
 
             # Workers must be created before wait_for_ready to avoid
             # deadlock, since worker.init_device() does a device sync.
@@ -124,6 +132,7 @@ def _init_executor(self) -> None:
                 for uw in unready_workers:
                     if uw.death_writer is not None:
                         uw.death_writer.close()
+                        uw.death_writer = None
                 self._ensure_worker_termination([uw.proc for uw in unready_workers])
 
         self.output_rank = self._get_output_rank()
@@ -158,24 +167,28 @@ def make_worker_process(
         input_shm_handle,  # Receive SchedulerOutput
         shared_worker_lock: LockType,
         is_driver_worker: bool = False,
+        inherited_fds: list[int] | None = None,
     ) -> UnreadyWorkerProcHandle:
         context = get_mp_context()
-        # (reader, writer)
-        reader, writer = context.Pipe(duplex=False)
-
-        # Create death pipe to detect parent process exit
+        # Ready pipe to communicate readiness from child to parent
+        ready_reader, ready_writer = context.Pipe(duplex=False)
+        # Death pipe to let child detect parent process exit
         death_reader, death_writer = context.Pipe(duplex=False)
-
+        if inherited_fds is not None:
+            inherited_fds = inherited_fds.copy()
+            inherited_fds.extend((ready_reader.fileno(), death_writer.fileno()))
         process_kwargs = {
             "vllm_config": vllm_config,
             "local_rank": local_rank,
             "rank": rank,
             "distributed_init_method": distributed_init_method,
             "input_shm_handle": input_shm_handle,
-            "ready_pipe": (reader, writer),
+            "ready_pipe": ready_writer,
             "death_pipe": death_reader,
             "shared_worker_lock": shared_worker_lock,
             "is_driver_worker": is_driver_worker,
+            # Have the worker close parent end of this worker's pipes too
+            "inherited_fds": inherited_fds if inherited_fds is not None else [],
         }
         # Run EngineCore busy loop in background process.
         proc = context.Process(
@@ -186,10 +199,12 @@ def make_worker_process(
         )
 
         proc.start()
-        writer.close()
+        # Close child ends of pipes here in the parent
+        ready_writer.close()
+        death_reader.close()
         # Keep death_writer open in parent - when parent exits,
         # death_reader in child will get EOFError
-        return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+        return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
 
 
 vllm.v1.executor.multiproc_executor.MultiprocExecutor = AscendMultiprocExecutor
diff --git a/vllm_ascend/patch/platform/patch_torch_accelerator.py b/vllm_ascend/patch/platform/patch_torch_accelerator.py
new file mode 100644
index 00000000000..431dce4e51b
--- /dev/null
+++ b/vllm_ascend/patch/platform/patch_torch_accelerator.py
@@ -0,0 +1,8 @@
+import torch
+
+
+def patch_empty_cache() -> None:  # zero-argument replacement for torch.accelerator.empty_cache
+    torch.npu.empty_cache()  # delegate to the torch.npu implementation
+
+
+torch.accelerator.empty_cache = patch_empty_cache  # rebinding takes effect when this module is imported
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index aa86823fe45..f7f33125dc2 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -46,7 +46,7 @@
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper, update_full_graph_params
 from vllm_ascend.ops.triton.spec_decode.utils import prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled, vllm_version_is
+from vllm_ascend.utils import enable_sp, lmhead_tp_enable, shared_expert_dp_enabled
 
 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -663,24 +663,7 @@ def _propose(
                 if not self.parallel_drafting:
                     for draft_step in range(1, self.num_speculative_tokens):
                         per_layer_attn_metadata = dict()
-                        if vllm_version_is("0.17.0"):
-                            for attn_group in self.draft_attn_groups:
-                                common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                                    draft_step,
-                                    attn_metadata,
-                                    common_attn_metadata,
-                                    batch_size,
-                                    num_input_tokens,
-                                    used_update_positions,
-                                    aclgraph_runtime_mode,
-                                    ori_seq_len,
-                                    slot_indices,
-                                    mtp_slot_mapping,
-                                    attn_group=attn_group,
-                                )
-                                for layer_name in self.attn_layer_names:
-                                    per_layer_attn_metadata[layer_name] = attn_metadata
-                        else:
+                        for attn_group in self.draft_attn_groups:
                             common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                                 draft_step,
                                 attn_metadata,
@@ -692,6 +675,7 @@ def _propose(
                                 ori_seq_len,
                                 slot_indices,
                                 mtp_slot_mapping,
+                                attn_group=attn_group,
                             )
                             for layer_name in self.attn_layer_names:
                                 per_layer_attn_metadata[layer_name] = attn_metadata
@@ -701,21 +685,7 @@ def _propose(
             if not self.parallel_drafting:
                 for draft_step in range(1, self.num_speculative_tokens):
                     per_layer_attn_metadata = dict()
-                    if vllm_version_is("0.17.0"):
-                        for attn_group in self.draft_attn_groups:
-                            common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
-                                draft_step,
-                                attn_metadata,
-                                common_attn_metadata,
-                                batch_size,
-                                num_input_tokens,
-                                used_update_positions,
-                                aclgraph_runtime_mode,
-                                attn_group=attn_group,
-                            )
-                            for layer_name in self.attn_layer_names:
-                                per_layer_attn_metadata[layer_name] = attn_metadata
-                    else:
+                    for attn_group in self.draft_attn_groups:
                         common_attn_metadata, attn_metadata = self.attn_update_stack_num_spec_norm(
                             draft_step,
                             attn_metadata,
@@ -724,6 +694,7 @@ def _propose(
                             num_input_tokens,
                             used_update_positions,
                             aclgraph_runtime_mode,
+                            attn_group=attn_group,
                         )
                         for layer_name in self.attn_layer_names:
                             per_layer_attn_metadata[layer_name] = attn_metadata
@@ -1064,16 +1035,11 @@ def set_inputs_first_pass(
             # 2.
             # Recompute the slot mapping based on the new positions and
             # rejection mask.
-            if vllm_version_is("0.17.0"):
-                # Use the first draft attention group's kv_cache_spec for block_size
-                # (all draft layers share the same kv-cache group)
-                assert len(self.draft_attn_groups) > 0
-                block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
-            else:
-                if self.attn_metadata_builder is None:
-                    block_size = self._get_attention_metadata_builder().kv_cache_spec.block_size
-                else:
-                    block_size = self.attn_metadata_builder.kv_cache_spec.block_size
+            # Use the first draft attention group's kv_cache_spec for block_size
+            # (all draft layers share the same kv-cache group)
+            assert len(self.draft_attn_groups) > 0
+            block_size = self.draft_attn_groups[0].kv_cache_spec.block_size
+
             new_slot_mapping = compute_new_slot_mapping(
                 cad=cad,
                 new_positions=self.positions[:total_num_output_tokens],
@@ -1112,8 +1078,7 @@ def attn_update_stack_num_spec_norm(
         attn_group=None,
     ):
         assert draft_step > 0
-        if vllm_version_is("0.17.0"):
-            assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
+        assert attn_group is not None, "vllm-ascend v0.17.0rc1 requires attn_group"
         common_attn_metadata = self.shallow_copy_metadata(old_common_metadata)
 
         if draft_step == 1:
@@ -1224,13 +1189,7 @@ def attn_update_stack_num_spec_norm(
             # Set the address of the attn_metadata.slot_mapping to the self.slot_mapping_group[idx]
             common_attn_metadata.slot_mapping = self.slot_mapping_group[draft_step]
 
-        if vllm_version_is("0.17.0"):
-            attn_metadata_builder = attn_group.get_metadata_builder()
-        else:
-            if self.attn_metadata_builder is None:
-                attn_metadata_builder = self._get_attention_metadata_builder()
-            else:
-                attn_metadata_builder = self.attn_metadata_builder
+        attn_metadata_builder = attn_group.get_metadata_builder()
 
         attn_metadata = attn_metadata_builder.build_for_drafting(
             common_attn_metadata=common_attn_metadata,
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 017ca2d32db..98542907670 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -126,7 +126,6 @@
     is_moe_model,
     lmhead_tp_enable,
     set_weight_prefetch_method,
-    vllm_version_is,
 )
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 from vllm_ascend.worker.pcp_utils import PCPManager
@@ -398,15 +397,14 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.cpu_slot_mapping = None
         self.sampling_done_event: torch.npu.Event | None = None
 
-        if vllm_version_is("0.17.0"):
-            # self.cudagraph_batch_sizes sorts in ascending order.
-            if (
-                self.compilation_config.cudagraph_capture_sizes
-                and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            ):
-                self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
-            else:
-                self.cudagraph_batch_sizes = []
+        # self.cudagraph_batch_sizes is sorted in ascending order.
+        if (
+            self.compilation_config.cudagraph_capture_sizes
+            and self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+        ):
+            self.cudagraph_batch_sizes = sorted(self.compilation_config.cudagraph_capture_sizes)
+        else:
+            self.cudagraph_batch_sizes = []
         self.mamba_state_idx: dict[str, int] = {}
         self._mamba_copy_bufs: mamba_utils.MambaCopyBuffers | None = None
 
@@ -1362,7 +1360,7 @@ def execute_model(
                 skip_compiled=has_encoder_input,
             ),
             self.maybe_get_kv_connector_output(
-                scheduler_output, clear_metadata=clear_kv_metadata
+                scheduler_output, defer_finalize=not clear_kv_metadata
             ) as kv_connector_output,
         ):
             hidden_states = self._model_forward(
@@ -2565,14 +2563,13 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
 
         self.may_reinitialize_input_batch(kv_cache_config)
         kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
-        if vllm_version_is("0.17.0"):
-            # TODO: refactor the logic of attention
-            # Initialize drafter attention group initialization
-            if self.speculative_config and (
-                self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
-            ):
-                assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
-                self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
+        # TODO: refactor the logic of attention
+        # Initialize the drafter's attention backend
+        if self.speculative_config and (
+            self.speculative_config.use_eagle() or self.speculative_config.uses_draft_model()
+        ):
+            assert isinstance(self.drafter, AscendEagleProposer | AscendDraftModelProposer)
+            self.drafter.initialize_attn_backend(kv_cache_config, self.kernel_block_sizes)
 
         if has_kv_transfer_group():
             get_kv_transfer_group().register_kv_caches(kv_caches)
@@ -2962,7 +2959,7 @@ def may_reinitialize_input_batch(self, kv_cache_config: KVCacheConfig) -> None:
             max_num_blocks.append(max_num_blocks_per_req)
 
         if block_sizes != [self.cache_config.block_size] or self.kernel_block_sizes != [[self.cache_config.block_size]]:
-            assert self.cache_config.cpu_offload_gb == 0, (
+            assert self.offload_config.uva.cpu_offload_gb == 0, (
                 "Cannot re-initialize the input batch when CPU weight "
                 "offloading is enabled. See https://github.com/vllm-project/vllm/pull/18298 "  # noqa: E501
                 "for more details."