2 changes: 1 addition & 1 deletion vllm_ascend/ops/mm_encoder_attention.py
@@ -104,6 +104,6 @@ def forward_oot(
         context_layer = context_layer[..., :origin_shape]
 
         context_layer = einops.rearrange(context_layer,
-                                         "(b s) h d -> s b (h d)",
+                                         "(b s) h d -> b s h d",
                                          b=bsz).contiguous()
         return context_layer
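The corrected pattern above keeps the attention output batch-first with separate head and head-dim axes, instead of sequence-first with the heads flattened. A minimal, shape-only sketch of the difference (tensor sizes are invented purely for illustration and are not part of this PR):

```python
import einops
import torch

# Invented sizes, only to show the shapes each pattern produces.
bsz, seq, heads, dim = 2, 4, 8, 16
context_layer = torch.randn(bsz * seq, heads, dim)

old = einops.rearrange(context_layer, "(b s) h d -> s b (h d)", b=bsz)
new = einops.rearrange(context_layer, "(b s) h d -> b s h d", b=bsz)

print(old.shape)  # torch.Size([4, 2, 128])   -- seq-first, heads flattened
print(new.shape)  # torch.Size([2, 4, 8, 16]) -- batch-first, heads kept separate
```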
58 changes: 58 additions & 0 deletions vllm_ascend/ops/rotary_embedding.py
@@ -18,12 +18,14 @@
 import math
 from typing import Optional, Tuple
 
+import einops
 import torch
 import torch_npu
 from vllm.config import CUDAGraphMode
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
     YaRNScalingRotaryEmbedding)
+from vllm.model_executor.layers.rotary_embedding.common import ApplyRotaryEmb
 
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
@@ -524,3 +526,59 @@ def forward_oot(
             rotary_mode='half')
 
         return query, key
+
+
+class AscendApplyRotaryEmb(ApplyRotaryEmb):
+
+    def __init__(
+        self,
+        enforce_enable: bool = False,
+        is_neox_style: bool = True,
+        enable_fp32_compute: bool = False,
+    ) -> None:
+        super().__init__(
+            enforce_enable=enforce_enable,
+            is_neox_style=is_neox_style,
+            enable_fp32_compute=enable_fp32_compute,
+        )
+
+    def forward_oot(
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        head_dim = x.shape[-1]
+
+        origin_dtype = x.dtype
+        if self.enable_fp32_compute:
+            x = x.float()
+            cos = cos.float()
+            sin = sin.float()
+
+        # cos, sin: [seq_len, head_dim // 2]
+        cos = torch.cat((cos, cos), dim=-1)
+        sin = torch.cat((sin, sin), dim=-1)
+        # cos, sin: [1, seq_len, 1, head_dim]
+        cos = cos.reshape(1, -1, 1, head_dim)
+        sin = sin.reshape(1, -1, 1, head_dim)
+
+        if len(x.shape) == 3:
+            # x: [seq_len, num_heads, head_size]
+            x = x.unsqueeze(0)
+            # x: [1, seq_len, num_heads, head_size]
+            output = torch_npu.npu_rotary_mul(x, cos, sin).squeeze(0)
+        else:
+            assert len(x.shape) == 4
+            # x: [2 * b, s, head, head_dim]
+            qk = einops.rearrange(
+                x, "(two b) s head head_dim -> b s two head head_dim", two=2)
+            # q, k: [b, s, head, head_dim]
+            q, k = qk[:, :, 0], qk[:, :, 1]
+            q = torch_npu.npu_rotary_mul(q, cos, sin)
+            k = torch_npu.npu_rotary_mul(k, cos, sin)
+            output = torch.cat([q, k], dim=0)
+
+        if self.enable_fp32_compute:
+            output = output.to(origin_dtype)
+        return output
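For reference, a hypothetical usage sketch of the new `AscendApplyRotaryEmb` (not part of the diff): tensor shapes are invented, random tensors stand in for the precomputed cos/sin cache, and an Ascend NPU with `torch_npu` importable is assumed. The cos/sin halves are concatenated above so that `npu_rotary_mul` can be applied over the full head_dim with broadcastable [1, seq_len, 1, head_dim] tables, and the sketch simply mirrors that usage.

```python
import torch
import torch_npu  # registers the "npu" device; requires Ascend hardware

# AscendApplyRotaryEmb as defined in the diff above.
rope = AscendApplyRotaryEmb(is_neox_style=True, enable_fp32_compute=False)

seq_len, num_heads, head_dim = 1024, 16, 128

# Random stand-ins for a query tensor and the rotary cos/sin cache; per the
# comments in forward_oot, cos and sin each carry head_dim // 2 values.
q = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float16, device="npu")
cos = torch.randn(seq_len, head_dim // 2, dtype=torch.float16, device="npu")
sin = torch.randn(seq_len, head_dim // 2, dtype=torch.float16, device="npu")

# 3-D input takes the unsqueeze/squeeze path and keeps its shape.
q_rot = rope.forward_oot(q, cos, sin)
print(q_rot.shape)  # expected: torch.Size([1024, 16, 128])

# 4-D input stacks q and k along dim 0 ([2 * b, s, heads, head_dim]) and
# rotates both in a single call.
b = 2
qk = torch.randn(2 * b, seq_len, num_heads, head_dim,
                 dtype=torch.float16, device="npu")
qk_rot = rope.forward_oot(qk, cos, sin)
print(qk_rot.shape)  # expected: torch.Size([4, 1024, 16, 128])
```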
62 changes: 8 additions & 54 deletions vllm_ascend/patch/__init__.py
@@ -146,53 +146,7 @@
 # Future Plan:
 # Identify this pattern in torch-npu and remove this patch.
 #
-# ** 5. File: worker/patch_qwen2_5_omni.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen2_5_omni_thinker.Qwen2_5OmniThinkerForConditionalGeneration`
-# Why:
-# we have ascend forward context which doesn't work with upstream.
-# How:
-# override forward_context in the model file
-# Related PR (if no, explain why):
-# This is a bug by Ascend only. we should drop set_ascend_forward_context
-# Future Plan:
-# Remove this patch once forward_context is refactor.
-#
-# ** 6. File: worker/patch_qwen2_5_vl.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration`
-# Why:
-# we have ascend forward context which doesn't work with upstream.
-# How:
-# override forward_context in the model file
-# Related PR (if no, explain why):
-# This is a bug by Ascend only. we should drop set_ascend_forward_context
-# Future Plan:
-# Remove this patch once forward_context is refactor.
-#
-# 2. `vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.forward`
-# Why:
-# the attention is not custom ops
-# How:
-# make it to custom ops and pluggable
-# Related PR (if no, explain why):
-# https://github.com/vllm-project/vllm/pull/30125
-# Future Plan:
-# Remove this patch one the PR is merged into vLLM.
-#
-# ** 7. File: worker/patch_qwen3_vl.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.forward`
-# Why:
-# the attention is not custom ops
-# How:
-# make it to custom ops and pluggable
-# Related PR (if no, explain why):
-# https://github.com/vllm-project/vllm/pull/30125
-# Future Plan:
-# Remove this patch one the PR is merged into vLLM.
-#
-# ** 8. File: worker/patch_roberta.py **
+# ** 5. File: worker/patch_roberta.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.bert `
 # Why:
@@ -204,7 +158,7 @@
 # Future Plan:
 # Revert this when CANN support shift aclnn operation
 #
-# ** 9. File: worker/patch_triton.py**
+# ** 6. File: worker/patch_triton.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.layers.mamba.ops`, `vllm.model_executor.layers.fla.ops`
 # Why:
@@ -216,7 +170,7 @@
 # Future Plan:
 # Remove this patch when vLLM support the dispatch function.
 #
-# ** 10. File: worker/patch_weight_loader.py**
+# ** 7. File: worker/patch_weight_loader.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.layers.linear.UnquantizedLinearMethod`
 # Why:
@@ -228,7 +182,7 @@
 # Future Plan:
 # Remove this patch when the bug is fixed.
 #
-# ** 11. File: worker/patch_qwen3_next_mtp.py**
+# ** 8. File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.utils.bind_kv_cache`
 # Why:
@@ -241,7 +195,7 @@
 # Future Plan:
 # Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 12. File: worker/patch_module.py**
+# ** 9. File: worker/patch_module.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 # Why:
@@ -257,7 +211,7 @@
 # Remove this patch when bool is supported in 'torch.argsort' func of npu.
 # Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 13. File: worker/patch_rejection_sampler.py**
+# ** 10. File: worker/patch_rejection_sampler.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.sample.rejection_sampler`
 # Why:
@@ -273,7 +227,7 @@
 # to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
 # 2. make these functions as costom op, then remove AscendRejectionSampler
 #
-# ** 14.File: worker/patch_qwen3_next.py**
+# ** 11.File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 # Why:
@@ -285,7 +239,7 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# ** 15. File: worker/patch_qwen3_next.py**
+# ** 12. File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 # Why:
2 changes: 0 additions & 2 deletions vllm_ascend/patch/worker/__init__.py
@@ -28,8 +28,6 @@
 import vllm_ascend.patch.worker.patch_weight_loader # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
 import vllm_ascend.patch.worker.patch_minicpm # noqa
-import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa
-import vllm_ascend.patch.worker.patch_qwen2_5_omni # noqa
 import vllm_ascend.patch.worker.patch_rope # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
66 changes: 0 additions & 66 deletions vllm_ascend/patch/worker/patch_qwen2_5_omni.py

This file was deleted.
