2 changes: 1 addition & 1 deletion vllm_ascend/ops/mm_encoder_attention.py
@@ -104,6 +104,6 @@ def forward_oot(
         context_layer = context_layer[..., :origin_shape]
 
         context_layer = einops.rearrange(context_layer,
-                                         "(b s) h d -> s b (h d)",
+                                         "(b s) h d -> b s h d",
                                          b=bsz).contiguous()
         return context_layer
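The corrected pattern above keeps the attention output batch-first with separate head and head-dim axes, instead of sequence-first with the heads flattened. A minimal, shape-only sketch of the difference (tensor sizes are invented purely for illustration and are not part of this PR):

```python
import einops
import torch

# Invented sizes, only to show the shapes each pattern produces.
bsz, seq, heads, dim = 2, 4, 8, 16
context_layer = torch.randn(bsz * seq, heads, dim)

old = einops.rearrange(context_layer, "(b s) h d -> s b (h d)", b=bsz)
new = einops.rearrange(context_layer, "(b s) h d -> b s h d", b=bsz)

print(old.shape)  # torch.Size([4, 2, 128])   -- seq-first, heads flattened
print(new.shape)  # torch.Size([2, 4, 8, 16]) -- batch-first, heads kept separate
```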
58 changes: 58 additions & 0 deletions vllm_ascend/ops/rotary_embedding.py
@@ -18,12 +18,14 @@
 import math
 from typing import Optional, Tuple
 
+import einops
 import torch
 import torch_npu
 from vllm.config import CUDAGraphMode
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
     YaRNScalingRotaryEmbedding)
+from vllm.model_executor.layers.rotary_embedding.common import ApplyRotaryEmb
 
 from vllm_ascend.platform import NPUPlatform
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
@@ -524,3 +526,59 @@ def forward_oot(
             rotary_mode='half')
 
         return query, key
+
+
+class AscendApplyRotaryEmb(ApplyRotaryEmb):
+
+    def __init__(
+        self,
+        enforce_enable: bool = False,
+        is_neox_style: bool = True,
+        enable_fp32_compute: bool = False,
+    ) -> None:
+        super().__init__(
+            enforce_enable=enforce_enable,
+            is_neox_style=is_neox_style,
+            enable_fp32_compute=enable_fp32_compute,
+        )
+
+    def forward_oot(
+        self,
+        x: torch.Tensor,
+        cos: torch.Tensor,
+        sin: torch.Tensor,
+    ) -> torch.Tensor:
+        head_dim = x.shape[-1]
+
+        origin_dtype = x.dtype
+        if self.enable_fp32_compute:
+            x = x.float()
+            cos = cos.float()
+            sin = sin.float()
+
+        # cos, sin: [seq_len, head_dim // 2]
+        cos = torch.cat((cos, cos), dim=-1)
+        sin = torch.cat((sin, sin), dim=-1)
+        # cos, sin: [1, seq_len, 1, head_dim]
+        cos = cos.reshape(1, -1, 1, head_dim)
+        sin = sin.reshape(1, -1, 1, head_dim)
+
+        if len(x.shape) == 3:
+            # x: [seq_len, num_heads, head_size]
+            x = x.unsqueeze(0)
+            # x: [1, seq_len, num_heads, head_size]
+            output = torch_npu.npu_rotary_mul(x, cos, sin).squeeze(0)
+        else:
+            assert len(x.shape) == 4
+            # x: [2 * b, s, head, head_dim]
+            qk = einops.rearrange(
+                x, "(two b) s head head_dim -> b s two head head_dim", two=2)
+            # q, k: [b, s, head, head_dim]
+            q, k = qk[:, :, 0], qk[:, :, 1]
+            q = torch_npu.npu_rotary_mul(q, cos, sin)
+            k = torch_npu.npu_rotary_mul(k, cos, sin)
+            output = torch.cat([q, k], dim=0)
+
+        if self.enable_fp32_compute:
+            output = output.to(origin_dtype)
+        return output
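For reference, a hypothetical usage sketch of the new `AscendApplyRotaryEmb` (not part of the diff): tensor shapes are invented, random tensors stand in for the precomputed cos/sin cache, and an Ascend NPU with `torch_npu` importable is assumed. The cos/sin halves are concatenated above so that `npu_rotary_mul` can be applied over the full head_dim with broadcastable [1, seq_len, 1, head_dim] tables, and the sketch simply mirrors that usage.

```python
import torch
import torch_npu  # registers the "npu" device; requires Ascend hardware

# AscendApplyRotaryEmb as defined in the diff above.
rope = AscendApplyRotaryEmb(is_neox_style=True, enable_fp32_compute=False)

seq_len, num_heads, head_dim = 1024, 16, 128

# Random stand-ins for a query tensor and the rotary cos/sin cache; per the
# comments in forward_oot, cos and sin each carry head_dim // 2 values.
q = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float16, device="npu")
cos = torch.randn(seq_len, head_dim // 2, dtype=torch.float16, device="npu")
sin = torch.randn(seq_len, head_dim // 2, dtype=torch.float16, device="npu")

# 3-D input takes the unsqueeze/squeeze path and keeps its shape.
q_rot = rope.forward_oot(q, cos, sin)
print(q_rot.shape)  # expected: torch.Size([1024, 16, 128])

# 4-D input stacks q and k along dim 0 ([2 * b, s, heads, head_dim]) and
# rotates both in a single call.
b = 2
qk = torch.randn(2 * b, seq_len, num_heads, head_dim,
                 dtype=torch.float16, device="npu")
qk_rot = rope.forward_oot(qk, cos, sin)
print(qk_rot.shape)  # expected: torch.Size([4, 1024, 16, 128])
```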
62 changes: 8 additions & 54 deletions vllm_ascend/patch/__init__.py
@@ -146,53 +146,7 @@
 # Future Plan:
 # Identify this pattern in torch-npu and remove this patch.
 #
-# ** 5. File: worker/patch_qwen2_5_omni.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen2_5_omni_thinker.Qwen2_5OmniThinkerForConditionalGeneration`
-# Why:
-# we have ascend forward context which doesn't work with upstream.
-# How:
-# override forward_context in the model file
-# Related PR (if no, explain why):
-# This is a bug by Ascend only. we should drop set_ascend_forward_context
-# Future Plan:
-# Remove this patch once forward_context is refactor.
-#
-# ** 6. File: worker/patch_qwen2_5_vl.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VLForConditionalGeneration`
-# Why:
-# we have ascend forward context which doesn't work with upstream.
-# How:
-# override forward_context in the model file
-# Related PR (if no, explain why):
-# This is a bug by Ascend only. we should drop set_ascend_forward_context
-# Future Plan:
-# Remove this patch once forward_context is refactor.
-#
-# 2. `vllm.model_executor.models.qwen2_vl.Qwen2VisionAttention.forward`
-# Why:
-# the attention is not custom ops
-# How:
-# make it to custom ops and pluggable
-# Related PR (if no, explain why):
-# https://github.com/vllm-project/vllm/pull/30125
-# Future Plan:
-# Remove this patch one the PR is merged into vLLM.
-#
-# ** 7. File: worker/patch_qwen3_vl.py**
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.model_executor.models.qwen3_vl.Qwen3_VisionTransformer.forward`
-# Why:
-# the attention is not custom ops
-# How:
-# make it to custom ops and pluggable
-# Related PR (if no, explain why):
-# https://github.com/vllm-project/vllm/pull/30125
-# Future Plan:
-# Remove this patch one the PR is merged into vLLM.
-#
-# ** 8. File: worker/patch_roberta.py **
+# ** 5. File: worker/patch_roberta.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.bert `
 # Why:
@@ -204,7 +158,7 @@
 # Future Plan:
 # Revert this when CANN support shift aclnn operation
 #
-# ** 9. File: worker/patch_triton.py**
+# ** 6. File: worker/patch_triton.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.layers.mamba.ops`, `vllm.model_executor.layers.fla.ops`
 # Why:
@@ -216,7 +170,7 @@
 # Future Plan:
 # Remove this patch when vLLM support the dispatch function.
 #
-# ** 10. File: worker/patch_weight_loader.py**
+# ** 7. File: worker/patch_weight_loader.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.layers.linear.UnquantizedLinearMethod`
 # Why:
@@ -228,7 +182,7 @@
 # Future Plan:
 # Remove this patch when the bug is fixed.
 #
-# ** 11. File: worker/patch_qwen3_next_mtp.py**
+# ** 8. File: worker/patch_qwen3_next_mtp.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.worker.utils.bind_kv_cache`
 # Why:
@@ -241,7 +195,7 @@
 # Future Plan:
 # Remove this patch after discussing with vllm community and adapting bind_kv_cache to npu.
 #
-# ** 12. File: worker/patch_module.py**
+# ** 9. File: worker/patch_module.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.attention.backends.gdn_attn.torch.argsort`
 # Why:
@@ -257,7 +211,7 @@
 # Remove this patch when bool is supported in 'torch.argsort' func of npu.
 # Make 'torch.argsort' in `vllm.v1.attention.backends.gdn_attn` be stable.
 #
-# ** 13. File: worker/patch_rejection_sampler.py**
+# ** 10. File: worker/patch_rejection_sampler.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.v1.sample.rejection_sampler`
 # Why:
@@ -273,7 +227,7 @@
 # to override them, then delete the patch file `worker/patch_rejection_sampler.py`.
 # 2. make these functions as costom op, then remove AscendRejectionSampler
 #
-# ** 14.File: worker/patch_qwen3_next.py**
+# ** 11.File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet.forward`
 # Why:
@@ -285,7 +239,7 @@
 # Future Plan:
 # Remove this patch when vLLM support these operators.
 #
-# ** 15. File: worker/patch_qwen3_next.py**
+# ** 12. File: worker/patch_qwen3_next.py**
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # 1. `vllm.model_executor.models.qwen3_next.Qwen3NextGatedDeltaNet._forward_core`
 # Why:
2 changes: 0 additions & 2 deletions vllm_ascend/patch/worker/__init__.py
@@ -28,8 +28,6 @@
 import vllm_ascend.patch.worker.patch_weight_loader # noqa
 import vllm_ascend.patch.worker.patch_multimodal_merge # noqa
 import vllm_ascend.patch.worker.patch_minicpm # noqa
-import vllm_ascend.patch.worker.patch_qwen2_5_vl # noqa
-import vllm_ascend.patch.worker.patch_qwen2_5_omni # noqa
 import vllm_ascend.patch.worker.patch_rope # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next_mtp # noqa
66 changes: 0 additions & 66 deletions vllm_ascend/patch/worker/patch_qwen2_5_omni.py

This file was deleted.
