diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 5af35eec2e9..10dbcaefabc 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=v0.19.0
+ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b
 RUN git init /vllm-workspace/vllm && \
     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
     git -C /vllm-workspace/vllm checkout FETCH_HEAD
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index c7d914fbf1c..894c8f3e589 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [v0.19.0]
+        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 93dd04a1061..93907052555 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: v0.19.0
+      vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -91,7 +91,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [v0.19.0]
+        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -103,7 +103,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [v0.19.0]
+        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -116,4 +116,4 @@ jobs:
       type: light
     secrets:
       HW_OBS_AK: ${{ secrets.HW_OBS_AK }}
-      HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
\ No newline at end of file
+      HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
diff --git a/Dockerfile b/Dockerfile
index 733c0668674..4409714cd82 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.310p b/Dockerfile.310p
index f0a44c6241c..1f53946d9b1 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -33,10 +33,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index 207186e6bf1..f1152c444e5 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -32,10 +32,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index 9304bc516af..c9ce6316de4 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -50,10 +50,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index 0f2ce692da2..096cb0e4615 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 48c91e43567..10266533a35 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py b/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
index 37a2a8c7bbf..fa9304d6422 100644
--- a/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
+++ b/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
@@ -8,11 +8,23 @@
 import vllm_ascend.patch.worker.patch_gdn_attn as patch_gdn_attn
 from vllm.config.compilation import CUDAGraphMode
+from vllm.model_executor.layers.fla.ops import index as _fla_index
 from vllm.v1.attention.backend import CommonAttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import MambaSpec
 
 
+@pytest.fixture(autouse=True)
+def _patch_triton_cdiv(monkeypatch):
+    if not hasattr(_fla_index.triton, "cdiv"):
+        monkeypatch.setattr(
+            _fla_index.triton,
+            "cdiv",
+            lambda a, b: (a + b - 1) // b,
+            raising=False,
+        )
+
+
 @dataclass
 class BatchSpec:
     seq_lens: list[int]
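
Editor's note: the autouse fixture above backfills `triton.cdiv` on the imported `_fla_index` module when the installed triton shim does not expose it. The lambda it installs is plain integer ceiling division; a minimal standalone sketch (pure Python, no triton needed, names below are illustrative only) of the identity it relies on:

    # Ceiling division for positive integers: (a + b - 1) // b == ceil(a / b).
    # This is the same fallback the fixture monkeypatches in as triton.cdiv.
    import math

    def cdiv(a: int, b: int) -> int:
        return (a + b - 1) // b

    assert cdiv(10, 4) == math.ceil(10 / 4) == 3
    assert cdiv(12, 4) == math.ceil(12 / 4) == 3
    assert cdiv(1, 8) == 1  # a single element still occupies one block
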
diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py
index 6ff3fb15be2..6534bce8bfc 100644
--- a/vllm_ascend/_310p/fused_moe/fused_moe.py
+++ b/vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -27,6 +27,7 @@
 from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
 from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
 from vllm_ascend.quantization.quant_type import QuantType
+from vllm_ascend.utils import vllm_version_is
 
 from .experts_selector import select_experts
 from .moe_comm_method import AllGatherCommImpl310
@@ -36,6 +37,10 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
     def __init__(self, moe: FusedMoEConfig = None):
         super().__init__(moe=moe)
 
+    @property
+    def is_monolithic(self) -> bool:
+        return False
+
     def process_weights_after_loading(self, layer):
         super().process_weights_after_loading(layer)
@@ -156,21 +161,20 @@ def __init__(self, *args, **kwargs):
         self.quant_type = self.get_quant_type()
         _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
-        self.runner = self._init_runner()
-
-    def _init_runner(self):
         from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
-        return AscendMoERunner(
-            layer=self,
-            moe_config=self.moe_config,
-            router=self.router,
-            routed_input_transform=self._routed_input_transform,
-            gate=self.gate,
-            shared_experts=self.shared_experts,
-            quant_method=self.quant_method,
-            reduce_results=self.reduce_results,
-            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate if is_legacy else kwargs.pop("gate", None),
+            self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
         )
 
     def init_experts_map(self, moe_config):
@@ -276,7 +280,23 @@ def __init__(
         self._gate = gate
         # Recreate runner after shared_experts/gate are set so custom op dispatch
         # goes through moe_forward_shared.
-        self.runner = self._init_runner()
+        # NOTE: must use self._shared_experts here, not self.shared_experts —
+        # FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
+        # which at this point is still the stale runner built with shared_experts=None.
+        from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
+
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate,
+            self._shared_experts,
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
+        )
 
     @property
     def is_internal_router(self) -> bool:
@@ -288,20 +308,16 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if self._shared_experts is None:
-            fused_out = AscendFusedMoE310.forward(
-                self,
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
-            shared_out = None
-            return shared_out, fused_out
-        shared_out, fused_out = AscendFusedMoE310.forward(
+        result = AscendFusedMoE310.forward(
             self,
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
-        return shared_out, fused_out
+        # When shared experts are absent, the parent returns only fused_out;
+        # otherwise it returns a (shared_out, fused_out) tuple.
+        if self._shared_experts is None:
+            return None, result
+        return result
 
     def _forward_shared_experts(self, hidden_states: torch.Tensor):
         if self._shared_experts is None:
diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 9bb05b36ee5..53651ddf912 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -27,10 +27,8 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
-from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase  # type: ignore
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
 from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
-from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter  # type: ignore
 from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner  # type: ignore
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
@@ -51,6 +49,7 @@
     npu_stream_switch,
     shared_expert_dp_enabled,
     shared_experts_calculation_stream,
+    vllm_version_is,
 )
@@ -81,6 +80,10 @@ def __init__(self, moe: FusedMoEConfig = None):
         super().__init__(moe=moe)
         self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb
 
+    @property
+    def is_monolithic(self) -> bool:
+        return False
+
     def process_weights_after_loading(self, layer):
         super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer)
@@ -219,68 +222,20 @@ def apply(
 # Please remove this inheritance after extending vllm, todo(wxs)
 class AscendMoERunner(DefaultMoERunner):
-    """
-    Default implementation of the MoE runner for executing Mixture of Experts layers.
-
-    This class provides a comprehensive implementation for running MoE computations
-    with support for:
-    - Expert routing and token dispatching
-    - Shared experts computation with optional parallel execution using CUDA streams
-    - Data parallel (DP) chunking for large batch processing
-    - Tensor model parallel and expert parallel operations
-    - Various quantization methods and custom operators
-    - Both monolithic and decomposed expert execution paths
-
-    The runner handles the complete MoE forward pass including routing tokens to
-    experts, executing expert computations, and combining results. It supports
-    advanced features like overlapped execution of shared experts and optimized
-    kernels for different parallel execution modes.
-
-    Eventually, this class will be split up and specialized for different
-    configurations, e.g. the presence or absence of shared experts, a gate, etc.
-    """
-
-    def __init__(
-        self,
-        layer: torch.nn.Module,
-        moe_config: FusedMoEConfig,
-        router: FusedMoERouter,
-        routed_input_transform: torch.nn.Module | None,
-        gate: torch.nn.Module | None,
-        shared_experts: torch.nn.Module | None,
-        quant_method: FusedMoEMethodBase,
-        reduce_results: bool,
-        enable_dbo: bool,
-    ):
-        super().__init__(
-            layer,
-            moe_config,
-            router,
-            routed_input_transform,
-            gate,
-            shared_experts,
-            quant_method,
-            reduce_results,
-            enable_dbo,
-        )
-        if self.shared_experts is None:
-            self.moe_forward = torch.ops.vllm.moe_forward
-        else:
-            self.moe_forward = torch.ops.vllm.moe_forward_shared
-
     @property
     def use_dp_chunking(self) -> bool:
         """Ascend uses its own forward_impl path, not the FlashInfer Cutlass chunked path.
         Always return False to stay on forward_impl."""
         return False
 
+    # TODO: Remove this after drop v0.19.0 support
     def forward_impl(
         self,
         layer: torch.nn.Module,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         shared_input: torch.Tensor | None,
-    ):
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """
         Override the default forward_impl to use Ascend-specific implementation.
         This delegates to the layer's forward_impl method which contains the
@@ -292,6 +247,21 @@ def forward_impl(
         # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
         return result
 
+    def forward_dispatch(
+        self,
+        layer: torch.nn.Module,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        shared_experts_input: torch.Tensor | None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        with self._sequence_parallel_context():
+            return self.forward_impl(
+                layer,
+                hidden_states,
+                router_logits,
+                shared_experts_input,
+            )
+
 
 class AscendFusedMoE(FusedMoE):
     moe_counter = -1
@@ -386,22 +356,18 @@ def __init__(self, *args, **kwargs):
         setup_moe_comm_method(self.moe_config)
 
         self.quant_type = self._get_quant_type()
-        self.runner = self._init_runner()
-
-    def _init_runner(self):
-        # Storing the runner in the FusedMoE is an intermediate state, eventually
-        # the runner will own the FusedMoE layer and provide the execution interface
-        # for MoE ops.
-        return AscendMoERunner(
-            layer=self,
-            moe_config=self.moe_config,
-            router=self.router,
-            routed_input_transform=self._routed_input_transform,
-            gate=self.gate,
-            shared_experts=self.shared_experts,
-            quant_method=self.quant_method,
-            reduce_results=self.reduce_results,
-            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate if is_legacy else kwargs.pop("gate", None),
+            self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
         )
 
     def _get_quant_type(self) -> QuantType:
@@ -605,17 +571,30 @@ def __init__(
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
-        self.multistream_overlap_shared_expert = (
-            ascend_config.multistream_overlap_shared_expert and self._shared_experts is not None
-        )
-        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and self._shared_experts is not None
+        has_shared_experts = shared_experts is not None
+        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert and has_shared_experts
+        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and has_shared_experts
         if enable_sp():
             logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
         self._gate = gate
-        # Recreate the runner with the correct shared_experts parameter
-        # The parent class created the runner before self._shared_experts was set
-        self.runner = self._init_runner()
+        # Recreate the runner with the correct shared_experts parameter.
+        # The parent class created the runner before self._shared_experts was set.
+        # NOTE: must use self._shared_experts here, not self.shared_experts —
+        # FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
+        # which at this point is still the stale runner built with shared_experts=None.
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate,
+            self._shared_experts,
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
+        )
 
         if self.multistream_overlap_shared_expert:
             # Wrap the quant_method's process_weights_after_loading to validate that
@@ -690,20 +669,16 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if self._shared_experts is None:
-            fused_out = AscendFusedMoE.forward(
-                self,
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
-            shared_out = None
-            return shared_out, fused_out
-        shared_out, fused_out = AscendFusedMoE.forward(
+        result = AscendFusedMoE.forward(
             self,
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
-        return shared_out, fused_out
+        # When shared experts are absent, the parent returns only fused_out;
+        # otherwise it returns a (shared_out, fused_out) tuple.
+        if self._shared_experts is None:
+            return None, result
+        return result
 
     def _forward_shared_experts(self, hidden_states: torch.Tensor, fused_moe_evts: FusedMoEEvents):
         if self._shared_experts is None:
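
Editor's note: the AscendMoERunner call sites above (and the one in the _310p variant) share one version-gated convention: on vLLM v0.19.0 the runner still receives the layer module plus the already-resolved gate/shared_experts, while on newer main it receives the layer name and pulls gate/shared_experts out of the constructor kwargs. A minimal standalone sketch of that argument selection, where `pick_runner_args` and the stubbed `vllm_version_is` are illustrative names only and not part of the patch:

    # Sketch only: mirrors how the positional arguments for AscendMoERunner are chosen.
    # The real vllm_version_is lives in vllm_ascend.utils; this is a stand-in.
    def vllm_version_is(target: str, installed: str = "0.19.0") -> bool:
        return installed == target

    def pick_runner_args(layer, layer_name, gate, shared_experts, kwargs):
        # Hypothetical helper. Legacy (v0.19.0): pass the layer module and the resolved
        # gate/shared_experts. Newer main: pass the layer name and defer to kwargs.
        is_legacy = vllm_version_is("0.19.0")
        return (
            layer if is_legacy else layer_name,
            gate if is_legacy else kwargs.pop("gate", None),
            shared_experts if is_legacy else kwargs.pop("shared_experts", None),
        )

    # With the default stub ("0.19.0" installed) the layer object and resolved modules win:
    print(pick_runner_args("layer-module", "model.layers.0.mlp.experts", "gate-module", None, {}))
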
diff --git a/vllm_ascend/patch/worker/patch_qwen3vl.py b/vllm_ascend/patch/worker/patch_qwen3vl.py
index 103e3d42077..8c03845672a 100644
--- a/vllm_ascend/patch/worker/patch_qwen3vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen3vl.py
@@ -2,10 +2,14 @@
 from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
 from vllm.model_executor.models.qwen3 import Qwen3Attention
 from vllm.model_executor.models.qwen3_moe import Qwen3MoeAttention
-from vllm.model_executor.models.qwen3_vl import Qwen3VLForConditionalGeneration
+from vllm.model_executor.models.qwen3_vl import (
+    Qwen3_VisionTransformer,
+    Qwen3VLForConditionalGeneration,
+)
 
 from vllm_ascend.ascend_forward_context import _EXTRA_CTX
 from vllm_ascend.ops.rotary_embedding import AscendMRotaryEmbedding
+from vllm_ascend.utils import vllm_version_is
 
 
 def tensor_parallel_wrap(func):
@@ -68,3 +72,25 @@ def forward_with_split_qkv_rmsnorm_mrope(self, positions: torch.Tensor, hidden_s
 Qwen3VLForConditionalGeneration._get_deepstack_input_embeds = tensor_parallel_wrap(
     Qwen3VLForConditionalGeneration._get_deepstack_input_embeds
 )
+
+if not vllm_version_is("0.19.0"):
+    # Only patch for latest main
+    from vllm.model_executor.models.qwen3_vl import pos_embed_interpolate_native
+
+    def _fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
+        outputs = []
+        for t, h, w in grid_thw:
+            outputs.append(
+                pos_embed_interpolate_native(
+                    self.pos_embed.weight,
+                    t,
+                    h,
+                    w,
+                    self.num_grid_per_side,
+                    self.spatial_merge_size,
+                    self.dtype,
+                )
+            )
+        return torch.cat(outputs, dim=0)
+
+    Qwen3_VisionTransformer.fast_pos_embed_interpolate = _fast_pos_embed_interpolate
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index 98dfafa92b5..8600465ad0d 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -183,6 +183,10 @@ def __init__(
         # To accumulate prompt logprobs tensor chunks across prefill steps.
         self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
 
+        # req_id -> list of specific token IDs to compute logprobs for
+        # More efficient than num_logprobs=-1 when only a few tokens are needed
+        self.logprob_token_ids: dict[str, list[int]] = {}
+
         # Internal representation of per-step batch state changes, used for
         # reordering persistent batch and generating logitsprocs batch state
         # updates. Should reset each step.
diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md
index 1ba4b3611a6..29a1adcf56b 100644
--- a/vllm_ascend/worker/v2/README.md
+++ b/vllm_ascend/worker/v2/README.md
@@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development.
 please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans.
 
-supported vllm version: main@v0.19.0
+supported vllm version: main@5af684c31912232e5c89484c2e8259e0fac6c55b
 
 related PR: