diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 5af35eec2e9..10dbcaefabc 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=v0.19.0
+ARG VLLM_COMMIT=5af684c31912232e5c89484c2e8259e0fac6c55b
 RUN git init /vllm-workspace/vllm && \
     git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
     git -C /vllm-workspace/vllm checkout FETCH_HEAD
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index c7d914fbf1c..894c8f3e589 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [v0.19.0]
+        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 93dd04a1061..93907052555 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: v0.19.0
+      vllm: 5af684c31912232e5c89484c2e8259e0fac6c55b
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -91,7 +91,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [v0.19.0]
+        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -103,7 +103,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [v0.19.0]
+        vllm_version: [5af684c31912232e5c89484c2e8259e0fac6c55b, v0.19.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
@@ -116,4 +116,4 @@ jobs:
       type: light
     secrets:
       HW_OBS_AK: ${{ secrets.HW_OBS_AK }}
-      HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
\ No newline at end of file
+      HW_OBS_SK: ${{ secrets.HW_OBS_SK }}
diff --git a/Dockerfile b/Dockerfile
index 733c0668674..4409714cd82 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,10 +48,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.310p b/Dockerfile.310p
index f0a44c6241c..1f53946d9b1 100644
--- a/Dockerfile.310p
+++ b/Dockerfile.310p
@@ -33,10 +33,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler
index 207186e6bf1..f1152c444e5 100644
--- a/Dockerfile.310p.openEuler
+++ b/Dockerfile.310p.openEuler
@@ -32,10 +32,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.a3 b/Dockerfile.a3
index 9304bc516af..c9ce6316de4 100644
--- a/Dockerfile.a3
+++ b/Dockerfile.a3
@@ -50,10 +50,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler
index 0f2ce692da2..096cb0e4615 100644
--- a/Dockerfile.a3.openEuler
+++ b/Dockerfile.a3.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler
index 48c91e43567..10266533a35 100644
--- a/Dockerfile.openEuler
+++ b/Dockerfile.openEuler
@@ -49,10 +49,8 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_COMMIT=v0.19.0
-RUN git init /vllm-workspace/vllm && \
-    git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
-    git -C /vllm-workspace/vllm checkout FETCH_HEAD
+ARG VLLM_TAG=v0.19.0
+RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip uninstall -y triton && \
diff --git a/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py b/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
index 37a2a8c7bbf..fa9304d6422 100644
--- a/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
+++ b/tests/ut/patch/worker/patch_common/test_patch_gdn_attn.py
@@ -8,11 +8,23 @@
 import vllm_ascend.patch.worker.patch_gdn_attn as patch_gdn_attn
 from vllm.config.compilation import CUDAGraphMode
+from vllm.model_executor.layers.fla.ops import index as _fla_index
 from vllm.v1.attention.backend import CommonAttentionMetadata
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.kv_cache_interface import MambaSpec
 
 
+@pytest.fixture(autouse=True)
+def _patch_triton_cdiv(monkeypatch):
+    if not hasattr(_fla_index.triton, "cdiv"):
+        monkeypatch.setattr(
+            _fla_index.triton,
+            "cdiv",
+            lambda a, b: (a + b - 1) // b,
+            raising=False,
+        )
+
+
 @dataclass
 class BatchSpec:
     seq_lens: list[int]
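
Editor's note: the autouse fixture above backfills `triton.cdiv` on the imported `_fla_index` module when the installed triton shim does not expose it. The lambda it installs is plain integer ceiling division; a minimal standalone sketch (pure Python, no triton needed, names below are illustrative only) of the identity it relies on:

    # Ceiling division for positive integers: (a + b - 1) // b == ceil(a / b).
    # This is the same fallback the fixture monkeypatches in as triton.cdiv.
    import math

    def cdiv(a: int, b: int) -> int:
        return (a + b - 1) // b

    assert cdiv(10, 4) == math.ceil(10 / 4) == 3
    assert cdiv(12, 4) == math.ceil(12 / 4) == 3
    assert cdiv(1, 8) == 1  # a single element still occupies one block
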
diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py
index 6ff3fb15be2..6534bce8bfc 100644
--- a/vllm_ascend/_310p/fused_moe/fused_moe.py
+++ b/vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -27,6 +27,7 @@
 from vllm_ascend.ops.fused_moe.moe_comm_method import FusedExpertsResult, _MoECommMethods
 from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
 from vllm_ascend.quantization.quant_type import QuantType
+from vllm_ascend.utils import vllm_version_is
 
 from .experts_selector import select_experts
 from .moe_comm_method import AllGatherCommImpl310
@@ -36,6 +37,10 @@ class AscendUnquantizedFusedMoEMethod310(UnquantizedFusedMoEMethod):
     def __init__(self, moe: FusedMoEConfig = None):
         super().__init__(moe=moe)
 
+    @property
+    def is_monolithic(self) -> bool:
+        return False
+
     def process_weights_after_loading(self, layer):
         super().process_weights_after_loading(layer)
@@ -156,21 +161,20 @@ def __init__(self, *args, **kwargs):
         self.quant_type = self.get_quant_type()
         _MoECommMethods[MoECommType.ALLGATHER] = AllGatherCommImpl310(self.moe_config)
-        self.runner = self._init_runner()
-
-    def _init_runner(self):
         from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
-        return AscendMoERunner(
-            layer=self,
-            moe_config=self.moe_config,
-            router=self.router,
-            routed_input_transform=self._routed_input_transform,
-            gate=self.gate,
-            shared_experts=self.shared_experts,
-            quant_method=self.quant_method,
-            reduce_results=self.reduce_results,
-            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate if is_legacy else kwargs.pop("gate", None),
+            self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
         )
 
     def init_experts_map(self, moe_config):
@@ -276,7 +280,23 @@ def __init__(
         self._gate = gate
         # Recreate runner after shared_experts/gate are set so custom op dispatch
         # goes through moe_forward_shared.
-        self.runner = self._init_runner()
+        # NOTE: must use self._shared_experts here, not self.shared_experts —
+        # FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
+        # which at this point is still the stale runner built with shared_experts=None.
+        from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
+
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate,
+            self._shared_experts,
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
+        )
 
     @property
     def is_internal_router(self) -> bool:
@@ -288,20 +308,16 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if self._shared_experts is None:
-            fused_out = AscendFusedMoE310.forward(
-                self,
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
-            shared_out = None
-            return shared_out, fused_out
-        shared_out, fused_out = AscendFusedMoE310.forward(
+        result = AscendFusedMoE310.forward(
             self,
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
-        return shared_out, fused_out
+        # When shared experts are absent, the parent returns only fused_out;
+        # otherwise it returns a (shared_out, fused_out) tuple.
+        if self._shared_experts is None:
+            return None, result
+        return result
 
     def _forward_shared_experts(self, hidden_states: torch.Tensor):
         if self._shared_experts is None:
diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py
index 9bb05b36ee5..53651ddf912 100644
--- a/vllm_ascend/ops/fused_moe/fused_moe.py
+++ b/vllm_ascend/ops/fused_moe/fused_moe.py
@@ -27,10 +27,8 @@
 from vllm.forward_context import get_forward_context
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
-from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase  # type: ignore
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
 from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
-from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter  # type: ignore
 from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner  # type: ignore
 from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE
@@ -51,6 +49,7 @@
     npu_stream_switch,
     shared_expert_dp_enabled,
     shared_experts_calculation_stream,
+    vllm_version_is,
 )
@@ -81,6 +80,10 @@ def __init__(self, moe: FusedMoEConfig = None):
         super().__init__(moe=moe)
         self.dynamic_eplb = get_ascend_config().eplb_config.dynamic_eplb
 
+    @property
+    def is_monolithic(self) -> bool:
+        return False
+
     def process_weights_after_loading(self, layer):
         super(UnquantizedFusedMoEMethod, self).process_weights_after_loading(layer)
@@ -219,68 +222,20 @@ def apply(
 # Please remove this inheritance after extending vllm, todo(wxs)
 class AscendMoERunner(DefaultMoERunner):
-    """
-    Default implementation of the MoE runner for executing Mixture of Experts layers.
-
-    This class provides a comprehensive implementation for running MoE computations
-    with support for:
-    - Expert routing and token dispatching
-    - Shared experts computation with optional parallel execution using CUDA streams
-    - Data parallel (DP) chunking for large batch processing
-    - Tensor model parallel and expert parallel operations
-    - Various quantization methods and custom operators
-    - Both monolithic and decomposed expert execution paths
-
-    The runner handles the complete MoE forward pass including routing tokens to
-    experts, executing expert computations, and combining results. It supports
-    advanced features like overlapped execution of shared experts and optimized
-    kernels for different parallel execution modes.
-
-    Eventually, this class will be split up and specialized for different
-    configurations, e.g. the presence or absence of shared experts, a gate, etc.
-    """
-
-    def __init__(
-        self,
-        layer: torch.nn.Module,
-        moe_config: FusedMoEConfig,
-        router: FusedMoERouter,
-        routed_input_transform: torch.nn.Module | None,
-        gate: torch.nn.Module | None,
-        shared_experts: torch.nn.Module | None,
-        quant_method: FusedMoEMethodBase,
-        reduce_results: bool,
-        enable_dbo: bool,
-    ):
-        super().__init__(
-            layer,
-            moe_config,
-            router,
-            routed_input_transform,
-            gate,
-            shared_experts,
-            quant_method,
-            reduce_results,
-            enable_dbo,
-        )
-        if self.shared_experts is None:
-            self.moe_forward = torch.ops.vllm.moe_forward
-        else:
-            self.moe_forward = torch.ops.vllm.moe_forward_shared
-
     @property
     def use_dp_chunking(self) -> bool:
         """Ascend uses its own forward_impl path, not the FlashInfer Cutlass chunked path.
         Always return False to stay on forward_impl."""
         return False
 
+    # TODO: Remove this after drop v0.19.0 support
     def forward_impl(
         self,
         layer: torch.nn.Module,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
         shared_input: torch.Tensor | None,
-    ):
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         """
         Override the default forward_impl to use Ascend-specific implementation.
         This delegates to the layer's forward_impl method which contains the
@@ -292,6 +247,21 @@ def forward_impl(
         # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared
         return result
 
+    def forward_dispatch(
+        self,
+        layer: torch.nn.Module,
+        hidden_states: torch.Tensor,
+        router_logits: torch.Tensor,
+        shared_experts_input: torch.Tensor | None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
+        with self._sequence_parallel_context():
+            return self.forward_impl(
+                layer,
+                hidden_states,
+                router_logits,
+                shared_experts_input,
+            )
+
 
 class AscendFusedMoE(FusedMoE):
     moe_counter = -1
@@ -386,22 +356,18 @@ def __init__(self, *args, **kwargs):
         setup_moe_comm_method(self.moe_config)
 
         self.quant_type = self._get_quant_type()
-        self.runner = self._init_runner()
-
-    def _init_runner(self):
-        # Storing the runner in the FusedMoE is an intermediate state, eventually
-        # the runner will own the FusedMoE layer and provide the execution interface
-        # for MoE ops.
-        return AscendMoERunner(
-            layer=self,
-            moe_config=self.moe_config,
-            router=self.router,
-            routed_input_transform=self._routed_input_transform,
-            gate=self.gate,
-            shared_experts=self.shared_experts,
-            quant_method=self.quant_method,
-            reduce_results=self.reduce_results,
-            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate if is_legacy else kwargs.pop("gate", None),
+            self.shared_experts if is_legacy else kwargs.pop("shared_experts", None),
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
         )
 
     def _get_quant_type(self) -> QuantType:
@@ -605,17 +571,30 @@ def __init__(
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
-        self.multistream_overlap_shared_expert = (
-            ascend_config.multistream_overlap_shared_expert and self._shared_experts is not None
-        )
-        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and self._shared_experts is not None
+        has_shared_experts = shared_experts is not None
+        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert and has_shared_experts
+        self.multistream_overlap_gate = ascend_config.multistream_overlap_gate and has_shared_experts
         if enable_sp():
             logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")
         self._gate = gate
-        # Recreate the runner with the correct shared_experts parameter
-        # The parent class created the runner before self._shared_experts was set
-        self.runner = self._init_runner()
+        # Recreate the runner with the correct shared_experts parameter.
+        # The parent class created the runner before self._shared_experts was set.
+        # NOTE: must use self._shared_experts here, not self.shared_experts —
+        # FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
+        # which at this point is still the stale runner built with shared_experts=None.
+        is_legacy = vllm_version_is("0.19.0")
+        self.runner = AscendMoERunner(
+            self if is_legacy else self.layer_name,
+            self.moe_config,
+            self.router,
+            self._routed_input_transform,
+            self.gate,
+            self._shared_experts,
+            self.quant_method,
+            self.reduce_results,
+            self.vllm_config.parallel_config.enable_dbo,
+        )
 
         if self.multistream_overlap_shared_expert:
             # Wrap the quant_method's process_weights_after_loading to validate that
@@ -690,20 +669,16 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        if self._shared_experts is None:
-            fused_out = AscendFusedMoE.forward(
-                self,
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
-            shared_out = None
-            return shared_out, fused_out
-        shared_out, fused_out = AscendFusedMoE.forward(
+        result = AscendFusedMoE.forward(
             self,
             hidden_states=hidden_states,
             router_logits=router_logits,
         )
-        return shared_out, fused_out
+        # When shared experts are absent, the parent returns only fused_out;
+        # otherwise it returns a (shared_out, fused_out) tuple.
+        if self._shared_experts is None:
+            return None, result
+        return result
 
     def _forward_shared_experts(self, hidden_states: torch.Tensor, fused_moe_evts: FusedMoEEvents):
         if self._shared_experts is None:
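
Editor's note: the AscendMoERunner call sites above (and the one in the _310p variant) share one version-gated convention: on vLLM v0.19.0 the runner still receives the layer module plus the already-resolved gate/shared_experts, while on newer main it receives the layer name and pulls gate/shared_experts out of the constructor kwargs. A minimal standalone sketch of that argument selection, where `pick_runner_args` and the stubbed `vllm_version_is` are illustrative names only and not part of the patch:

    # Sketch only: mirrors how the positional arguments for AscendMoERunner are chosen.
    # The real vllm_version_is lives in vllm_ascend.utils; this is a stand-in.
    def vllm_version_is(target: str, installed: str = "0.19.0") -> bool:
        return installed == target

    def pick_runner_args(layer, layer_name, gate, shared_experts, kwargs):
        # Hypothetical helper. Legacy (v0.19.0): pass the layer module and the resolved
        # gate/shared_experts. Newer main: pass the layer name and defer to kwargs.
        is_legacy = vllm_version_is("0.19.0")
        return (
            layer if is_legacy else layer_name,
            gate if is_legacy else kwargs.pop("gate", None),
            shared_experts if is_legacy else kwargs.pop("shared_experts", None),
        )

    # With the default stub ("0.19.0" installed) the layer object and resolved modules win:
    print(pick_runner_args("layer-module", "model.layers.0.mlp.experts", "gate-module", None, {}))
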
diff --git a/vllm_ascend/patch/worker/patch_qwen3vl.py b/vllm_ascend/patch/worker/patch_qwen3vl.py
index 103e3d42077..8c03845672a 100644
--- a/vllm_ascend/patch/worker/patch_qwen3vl.py
+++ b/vllm_ascend/patch/worker/patch_qwen3vl.py
@@ -2,10 +2,14 @@
 from vllm.distributed import get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size
 from vllm.model_executor.models.qwen3 import Qwen3Attention
 from vllm.model_executor.models.qwen3_moe import Qwen3MoeAttention
-from vllm.model_executor.models.qwen3_vl import Qwen3VLForConditionalGeneration
+from vllm.model_executor.models.qwen3_vl import (
+    Qwen3_VisionTransformer,
+    Qwen3VLForConditionalGeneration,
+)
 
 from vllm_ascend.ascend_forward_context import _EXTRA_CTX
 from vllm_ascend.ops.rotary_embedding import AscendMRotaryEmbedding
+from vllm_ascend.utils import vllm_version_is
 
 
 def tensor_parallel_wrap(func):
@@ -68,3 +72,25 @@ def forward_with_split_qkv_rmsnorm_mrope(self, positions: torch.Tensor, hidden_s
 Qwen3VLForConditionalGeneration._get_deepstack_input_embeds = tensor_parallel_wrap(
     Qwen3VLForConditionalGeneration._get_deepstack_input_embeds
 )
+
+if not vllm_version_is("0.19.0"):
+    # Only patch for latest main
+    from vllm.model_executor.models.qwen3_vl import pos_embed_interpolate_native
+
+    def _fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
+        outputs = []
+        for t, h, w in grid_thw:
+            outputs.append(
+                pos_embed_interpolate_native(
+                    self.pos_embed.weight,
+                    t,
+                    h,
+                    w,
+                    self.num_grid_per_side,
+                    self.spatial_merge_size,
+                    self.dtype,
+                )
+            )
+        return torch.cat(outputs, dim=0)
+
+    Qwen3_VisionTransformer.fast_pos_embed_interpolate = _fast_pos_embed_interpolate
diff --git a/vllm_ascend/worker/npu_input_batch.py b/vllm_ascend/worker/npu_input_batch.py
index 98dfafa92b5..8600465ad0d 100644
--- a/vllm_ascend/worker/npu_input_batch.py
+++ b/vllm_ascend/worker/npu_input_batch.py
@@ -183,6 +183,10 @@ def __init__(
         # To accumulate prompt logprobs tensor chunks across prefill steps.
         self.in_progress_prompt_logprobs_cpu: dict[str, LogprobsTensors] = {}
 
+        # req_id -> list of specific token IDs to compute logprobs for
+        # More efficient than num_logprobs=-1 when only a few tokens are needed
+        self.logprob_token_ids: dict[str, list[int]] = {}
+
         # Internal representation of per-step batch state changes, used for
         # reordering persistent batch and generating logitsprocs batch state
         # updates. Should reset each step.
diff --git a/vllm_ascend/worker/v2/README.md b/vllm_ascend/worker/v2/README.md
index 1ba4b3611a6..29a1adcf56b 100644
--- a/vllm_ascend/worker/v2/README.md
+++ b/vllm_ascend/worker/v2/README.md
@@ -5,5 +5,5 @@ This directory contains the new model runner which is under active development.
 please see [Model Runner V2](https://github.com/vllm-project/vllm-ascend/issues/5208) to get specific plans.
 
-supported vllm version: main@v0.19.0
+supported vllm version: main@5af684c31912232e5c89484c2e8259e0fac6c55b
 
 related PR: