diff --git a/.github/workflows/_pre_commit.yml b/.github/workflows/_pre_commit.yml
index dc848c2a088..0e6b6ddb153 100644
--- a/.github/workflows/_pre_commit.yml
+++ b/.github/workflows/_pre_commit.yml
@@ -38,6 +38,7 @@ jobs:
           repository: vllm-project/vllm
           path: ./vllm-empty
           ref: ${{ inputs.vllm }}
+
       - uses: dorny/paths-filter@v3
         id: filter
         with:
@@ -62,10 +63,12 @@ jobs:
         run: |
           git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
           pre-commit run --all-files --hook-stage manual --show-diff-on-failure
+
       - name: Run mypy
         run: |
           PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty"
           export PYTHONPATH
+          env
           git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
           # Run mypy for Python 3.10, 3.11, 3.12 manually
           # Note: We are now separating mypy from pre-commit hooks for performance reasons.
diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml
index dfc8047f55e..bfb577ac8a0 100644
--- a/.github/workflows/bot_pr_create.yaml
+++ b/.github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=v0.15.0
+          VLLM_COMMIT=80f921ba4bab2ea251d149305ea0f912c6fc218a
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

       - name: Checkout repository
diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint
index 17801c1c04d..2f344c969eb 100644
--- a/.github/workflows/dockerfiles/Dockerfile.lint
+++ b/.github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=v0.15.0
+ARG VLLM_COMMIT=80f921ba4bab2ea251d149305ea0f912c6fc218a
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml
index 99817ed1627..f3b339fc6bf 100644
--- a/.github/workflows/pr_test_full.yaml
+++ b/.github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [v0.15.0]
+        vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml
index 7a5eb5b9dfe..4020a8a28f6 100644
--- a/.github/workflows/pr_test_light.yaml
+++ b/.github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: v0.15.0
+      vllm: 80f921ba4bab2ea251d149305ea0f912c6fc218a
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -87,7 +87,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [v0.15.0]
+        vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [v0.15.0]
+        vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml
index 6c76109963e..920dc4bf8dc 100644
--- a/.github/workflows/schedule_codecov_refresh.yaml
+++ b/.github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [v0.15.0]
+        vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md
index 6976f24e972..dc1f55b5dcc 100644
--- a/docs/source/community/versioning_policy.md
+++ b/docs/source/community/versioning_policy.md
@@ -55,7 +55,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | 80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

 ## Release cadence

diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py
index 553c715fd15..51133d806e2 100644
--- a/tests/ut/eplb/core/test_eplb_utils.py
+++ b/tests/ut/eplb/core/test_eplb_utils.py
@@ -9,6 +9,7 @@

 from vllm_ascend.ascend_config import init_ascend_config
 from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
+from vllm_ascend.utils import vllm_version_is

 # isort: on

@@ -21,7 +22,13 @@ def setUp(self, mock_fix_incompatible_config):
             "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
         }
         from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
-        moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
+        if vllm_version_is("0.15.0"):
+            moe_parallel_config = FusedMoEParallelConfig(
+                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
+        else:
+            moe_parallel_config = FusedMoEParallelConfig(
+                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
+                is_sequence_parallel=False, enable_eplb=True)
         moe_config = FusedMoEConfig(
             num_experts=8,
             experts_per_token=8,
diff --git a/tests/ut/ops/test_mla.py b/tests/ut/ops/test_mla.py
index d450114585b..22503a574c5 100644
--- a/tests/ut/ops/test_mla.py
+++ b/tests/ut/ops/test_mla.py
@@ -82,8 +82,13 @@ def setUp(self):
     @patch("vllm_ascend.ops.mla.get_tensor_model_parallel_world_size")
     def test_initialization(self, mock_tp_size, mock_ascend_config,
                             mock_get_vllm_config):
+        # Create a proper mock for MLAAttention that has the required attributes
+        mock_mla_attn = MagicMock()
+        mock_mla_attn.process_weights_after_loading = MagicMock()
+        mock_mla_attn.impl = MagicMock()
+        mock_mla_attn.impl.process_weights_after_loading = MagicMock()

-        with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True):
+        with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn):
             mock_tp_size.return_value = 2
             mock_ascend_config.return_value.enable_shared_expert_dp = True
             mock_vllm_config = MagicMock(spec=VllmConfig)
@@ -126,7 +131,14 @@ def test_forward(self, mock_get_forward_context, mock_tp_size,
                                              num_hidden_layers=32, first_k_dense_replace=False)
         mock_get_vllm_config.return_value = mock_vllm_config
         mock_vllm_config.compilation_config = CompilationConfig()
-        with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True):
+
+        # Create a proper mock for MLAAttention that has the required attributes
+        mock_mla_attn = MagicMock()
+        mock_mla_attn.process_weights_after_loading = MagicMock()
+        mock_mla_attn.impl = MagicMock()
+        mock_mla_attn.impl.process_weights_after_loading = MagicMock()
+
+        with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn):
             attn = AscendMultiHeadLatentAttention(
                 hidden_size=self.hidden_size,
                 num_heads=self.num_heads,
diff --git a/tests/ut/quantization/test_modelslim_config.py b/tests/ut/quantization/test_modelslim_config.py
index 667a7c0d8e5..2a9e0215b96 100644
--- a/tests/ut/quantization/test_modelslim_config.py
+++ b/tests/ut/quantization/test_modelslim_config.py
@@ -1,6 +1,5 @@
 from unittest.mock import MagicMock, patch

-from vllm.attention.layer import Attention
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.linear import LinearBase
@@ -8,7 +7,12 @@
 from tests.ut.base import TestBase
 from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
 from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig
-from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
+from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
+
+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import Attention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import Attention


 class TestAscendModelSlimConfig(TestBase):
diff --git a/tests/ut/spec_decode/test_eagle_proposer.py b/tests/ut/spec_decode/test_eagle_proposer.py
index 0a6cbfb54f8..57eabef5825 100644
--- a/tests/ut/spec_decode/test_eagle_proposer.py
+++ b/tests/ut/spec_decode/test_eagle_proposer.py
@@ -28,12 +28,15 @@ def setUp(self):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
+        self.vllm_config.model_config.uses_xdrope_dim = 0
         self.vllm_config.parallel_config.tensor_parallel_size = 1
+        self.vllm_config.parallel_config.data_parallel_rank = 0
         self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.num_speculative_tokens = 2
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
         ])
+        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
         self.vllm_config.additional_config = None

         self.mock_cpugpubuffer = patch(
@@ -141,12 +144,15 @@ def setUp(self):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
+        self.vllm_config.model_config.uses_xdrope_dim = 0
         self.vllm_config.parallel_config.tensor_parallel_size = 1
+        self.vllm_config.parallel_config.data_parallel_rank = 0
         self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.num_speculative_tokens = 2
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
         ])
+        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
         self.vllm_config.additional_config = None

         init_ascend_config(self.vllm_config)
@@ -285,12 +291,15 @@ def setUp(self):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
+        self.vllm_config.model_config.uses_xdrope_dim = 0
         self.vllm_config.model_config.use_mla = False
         self.vllm_config.parallel_config.tensor_parallel_size = 1
+        self.vllm_config.parallel_config.data_parallel_rank = 0
         self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(4)
         ])
+        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
         self.vllm_config.additional_config = None

         init_ascend_config(self.vllm_config)
@@ -404,12 +413,15 @@ def setUp(self):
         self.vllm_config.model_config.dtype = torch.float16
         self.vllm_config.model_config.max_model_len = 2048
         self.vllm_config.model_config.uses_mrope = False
+        self.vllm_config.model_config.uses_xdrope_dim = 0
         self.vllm_config.parallel_config.tensor_parallel_size = 1
+        self.vllm_config.parallel_config.data_parallel_rank = 0
         self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
         self.vllm_config.speculative_config.num_speculative_tokens = 2
         self.vllm_config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
         ])
+        self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
         self.vllm_config.additional_config = None

         init_ascend_config(self.vllm_config)
diff --git a/tests/ut/spec_decode/test_mtp_proposer.py b/tests/ut/spec_decode/test_mtp_proposer.py
index 29a55c06021..c6d28185d4c 100644
--- a/tests/ut/spec_decode/test_mtp_proposer.py
+++ b/tests/ut/spec_decode/test_mtp_proposer.py
@@ -34,6 +34,7 @@ def vllm_config(self):
         config.speculative_config.draft_model_config = MagicMock()
         config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
         config.speculative_config.draft_model_config.uses_mrope = False
+        config.speculative_config.draft_model_config.uses_xdrope_dim = 0
         config.speculative_config.speculative_token_tree = str([
             (i + 1) * (0, ) for i in range(2)
         ])
@@ -42,9 +43,11 @@ def vllm_config(self):
         config.model_config.dtype = torch.float16
         config.model_config.max_model_len = 2048
         config.model_config.uses_mrope = False
+        config.model_config.uses_xdrope_dim = 0
         config.model_config.hf_text_config = None
         config.model_config.hf_config = None
         config.parallel_config.tensor_parallel_size = 1
+        config.parallel_config.data_parallel_rank = 0
         config.speculative_config.draft_tensor_parallel_size = 1
         config.load_config = None

diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
index e3c4b0e86d2..7d936ab23d7 100644
--- a/vllm_ascend/attention/mla_v1.py
+++ b/vllm_ascend/attention/mla_v1.py
@@ -1450,6 +1450,28 @@ def _mla_preprocess(self, layer_name, hidden_states, kv_cache, attn_metadata, ne
     def get_num_actual_tokens(self, attn_metadata: M):
         return attn_metadata.num_actual_tokens

+    def forward_mha(
+        self,
+        layer_name: str,
+        hidden_states: torch.Tensor,
+        kv_cache: tuple[torch.Tensor],
+        attn_metadata: M,
+        need_gather_q_kv: bool = False,
+        output: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError("forward_mha is not supported for MLA attention. Use forward() instead.")
+
+    def forward_mqa(
+        self,
+        layer_name: str,
+        hidden_states: torch.Tensor,
+        kv_cache: tuple[torch.Tensor],
+        attn_metadata: M,
+        need_gather_q_kv: bool = False,
+        output: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError("forward_mqa is not supported for MLA attention. Use forward() instead.")
+
     def forward(
         self,
         layer_name,
diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py
index 26f8c92705a..e2271956c41 100644
--- a/vllm_ascend/attention/sfa_v1.py
+++ b/vllm_ascend/attention/sfa_v1.py
@@ -1062,3 +1062,24 @@ def _handle_o_proj_weight_switch_and_forward(
             torch.distributed.all_to_all_single(attn_output, send, group=get_tp_group().device_group)

         return attn_output, True
+
+    def forward_mha(
+        self,
+        q: torch.Tensor,
+        kv_c_normed: torch.Tensor,
+        k_pe: torch.Tensor,
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: M,
+        k_scale: torch.Tensor,
+        output: torch.Tensor,
+    ) -> None:
+        raise NotImplementedError("forward_mha is not supported for SFA attention. Use forward() instead.")
+
+    def forward_mqa(
+        self,
+        q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
+        kv_c_and_k_pe_cache: torch.Tensor,
+        attn_metadata: M,
+        layer,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        raise NotImplementedError("forward_mqa is not supported for SFA attention. Use forward() instead.")
diff --git a/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py
index 8586e6d9d18..984a0579826 100644
--- a/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py
+++ b/vllm_ascend/compilation/npugraph_ex_passes/graphex_qknorm_rope_fusion_pass.py
@@ -18,7 +18,6 @@

 import torch
 import torchair
-from vllm.attention.layer import Attention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.config.compilation import Range
 from vllm.logger import logger
@@ -27,6 +26,12 @@
     check_and_register_fusion_pass,
     extra_stream_scope_check,
 )
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import Attention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import Attention


 class GraphEXQKNormRopeFusionPattern:
diff --git a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
index f9dbf7685d5..29b8ed843e8 100644
--- a/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
+++ b/vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
@@ -18,12 +18,18 @@
 import torch
 import torch._inductor.pattern_matcher as pm
 from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter
-from vllm.attention.layer import Attention
 from vllm.compilation.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.config.compilation import Range
 from vllm.logger import logger

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import Attention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import Attention
+

 class QKNormRopeFusionPattern:
     def __init__(self, vllm_config, head_dim, num_heads, num_kv_heads, eps=1e-6):
diff --git a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py
index c9d2cc1d6d7..614372dad87 100644
--- a/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py
+++ b/vllm_ascend/distributed/kv_transfer/kv_pool/cpu_offload/cpu_offload_connector.py
@@ -10,7 +10,6 @@
 from typing import TYPE_CHECKING, Any, Optional

 import torch
-from vllm.attention.layer import Attention, MLAAttention
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole
@@ -27,6 +26,7 @@
     MetadataServerProc,
     MLAConfig,
 )
+from vllm_ascend.utils import vllm_version_is

 if TYPE_CHECKING:
     from vllm.forward_context import ForwardContext
@@ -35,6 +35,11 @@
     from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request

+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import Attention, MLAAttention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import Attention, MLAAttention
+

 @dataclass
 class ReqMeta:
diff --git a/vllm_ascend/kv_offload/cpu_npu.py b/vllm_ascend/kv_offload/cpu_npu.py
index 76b1926fb3b..98a4d892101 100644
--- a/vllm_ascend/kv_offload/cpu_npu.py
+++ b/vllm_ascend/kv_offload/cpu_npu.py
@@ -6,6 +6,8 @@
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import OffloadingHandler, TransferResult, TransferSpec

+from vllm_ascend.utils import vllm_version_is
+
 logger = init_logger(__name__)

@@ -153,12 +155,30 @@ def transfer_async(self, job_id: int, spec: TransferSpec) -> bool:

     def get_finished(self) -> list[TransferResult]:
         results: list[TransferResult] = []
-        for job_id, event in self.transfer_events.items():
-            if event.query():
-                results.append((job_id, True))
-                self.events_pool.append(event)
-        for job_id, _ in results:
-            del self.transfer_events[job_id]
+        if vllm_version_is("v0.15.0"):
+            for job_id, event in self.transfer_events.items():
+                if event.query():
+                    results.append((job_id, True))
+                    self.events_pool.append(event)
+            for job_id, _ in results:
+                del self.transfer_events[job_id]
+        else:
+            finished_job_ids = []
+            for job_id, event in self.transfer_events.items():
+                if event.query():
+                    results.append(
+                        TransferResult(
+                            job_id=job_id,
+                            success=True,
+                            transfer_size=None,
+                            transfer_time=None,
+                            transfer_type=None,
+                        )
+                    )
+                    finished_job_ids.append(job_id)
+                    self.events_pool.append(event)
+            for job_id in finished_job_ids:
+                del self.transfer_events[job_id]
         return results

     def wait(self, job_ids: set[int]) -> None:
diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py
index c2a3f5766e1..bf3bda6c0be 100644
--- a/vllm_ascend/ops/mla.py
+++ b/vllm_ascend/ops/mla.py
@@ -23,7 +23,6 @@

 import torch
 from torch import nn
-from vllm.attention.layer import MLAAttention
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.forward_context import ForwardContext, get_forward_context
@@ -34,6 +33,12 @@
 from vllm.v1.attention.backend import AttentionMetadata  # type: ignore

 from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import MLAAttention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import MLAAttention


 class IndexerWrapper(nn.Module):
@@ -125,6 +130,16 @@ def __init__(
             o_proj=mla_modules.o_proj,
         )

+        original_process_weights = self.mla_attn.process_weights_after_loading
+
+        def wrapped_process_weights(act_dtype: torch.dtype):
+            from vllm_ascend.attention.sfa_v1 import AscendSFAImpl
+            if not isinstance(self.mla_attn.impl, AscendSFAImpl):
+                original_process_weights(act_dtype)
+            self.mla_attn.impl.process_weights_after_loading(act_dtype)
+
+        self.mla_attn.process_weights_after_loading = wrapped_process_weights
+
         compilation_config = get_current_vllm_config().compilation_config
         if prefix in compilation_config.static_forward_context:
             raise ValueError(f"Duplicate layer name: {prefix}")
diff --git a/vllm_ascend/patch/worker/__init__.py b/vllm_ascend/patch/worker/__init__.py
index d214dbad05a..2fd0498f7c1 100644
--- a/vllm_ascend/patch/worker/__init__.py
+++ b/vllm_ascend/patch/worker/__init__.py
@@ -33,3 +33,4 @@
 import vllm_ascend.patch.worker.patch_rejection_sampler  # noqa
 import vllm_ascend.patch.worker.patch_qwen3_next  # noqa
 import vllm_ascend.patch.worker.patch_v2_egale  # noqa
+import vllm_ascend.patch.worker.patch_huanyuan_vl  # noqa
diff --git a/vllm_ascend/patch/worker/patch_huanyuan_vl.py b/vllm_ascend/patch/worker/patch_huanyuan_vl.py
new file mode 100644
index 00000000000..7637177614b
--- /dev/null
+++ b/vllm_ascend/patch/worker/patch_huanyuan_vl.py
@@ -0,0 +1,27 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#from collections.abc import Iterable
+
+from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor
+
+_original_call = HunYuanVLProcessor.__call__
+
+
+def _patched_call(self, images=None, text=None, videos=None, **kwargs):
+    """Remove add_special_tokens requirement."""
+    kwargs.pop("add_special_tokens", None)
+    return _original_call(self, images=images, text=text, videos=videos, **kwargs)
+
+HunYuanVLProcessor.__call__ = _patched_call
\ No newline at end of file
diff --git a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
index e150d36fe05..cf97bc14d90 100644
--- a/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
+++ b/vllm_ascend/patch/worker/patch_qwen3_next_mtp.py
@@ -1,8 +1,12 @@
 import torch
 import vllm.v1.worker.utils as utils
-from vllm.attention.layer import Attention
 from vllm.v1.worker.utils import defaultdict, extract_layer_index

+from vllm_ascend.utils import vllm_version_is
+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import Attention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import Attention

 # Without this patch, it will raise an exception when initialize kv_cache.
 # TODO To remove the patch, we need check why the original bind_kv_cache raises an NotImplementedError.
diff --git a/vllm_ascend/quantization/modelslim_config.py b/vllm_ascend/quantization/modelslim_config.py
index 0303b0dc6b0..227b97fbaae 100644
--- a/vllm_ascend/quantization/modelslim_config.py
+++ b/vllm_ascend/quantization/modelslim_config.py
@@ -401,7 +401,13 @@ def get_quant_method(self, layer: torch.nn.Module,
             self.packed_modules_mapping = packed_modules_model_mapping[
                 model_type]
             prefix = self.quant_prefix_mapper(model_type, prefix)
-        from vllm.attention.layer import Attention
+
+        from vllm_ascend.utils import vllm_version_is
+        if vllm_version_is("v0.15.0"):
+            from vllm.attention.layer import Attention  # type: ignore
+        else:
+            from vllm.model_executor.layers.attention import Attention
+
         if prefix.startswith("language_model"):
             prefix = prefix.split('.', 1)[-1]
         if isinstance(layer, LinearBase):
diff --git a/vllm_ascend/spec_decode/eagle_proposer.py b/vllm_ascend/spec_decode/eagle_proposer.py
index 8864155be1e..986d7a710dc 100644
--- a/vllm_ascend/spec_decode/eagle_proposer.py
+++ b/vllm_ascend/spec_decode/eagle_proposer.py
@@ -41,7 +41,7 @@
 from vllm_ascend.ops.triton.spec_decode.utils import \
     prepare_inputs_padded_kernel
 from vllm_ascend.ops.triton.triton_utils import get_vectorcore_num
-from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, lmhead_tp_enable
+from vllm_ascend.utils import enable_sp, shared_expert_dp_enabled, lmhead_tp_enable, vllm_version_is

 # Currently we will fix block size to a small one since `num_reqs` can't be too large
 _PREPARE_INPUTS_BLOCK_SIZE = 4
@@ -400,6 +400,12 @@ def dummy_run(self,
                 is_draft_model=True,
                 draft_attn_metadatas=multi_steps_attn_metadata):
+            if not vllm_version_is("v0.15.0"):
+                # Reset MOE layer index before first model call
+                forward_context = get_forward_context()
+                if forward_context is not None:
+                    forward_context.moe_layer_index = 0
+
             self._runnable(
                 num_input_tokens=num_tokens,
                 batch_size=batch_size,
@@ -559,6 +565,12 @@ def _propose(
                 is_draft_model=True,
                 draft_attn_metadatas=multi_steps_attn_metadata):
+            if not vllm_version_is("v0.15.0"):
+                # Reset MOE layer index for forward pass
+                forward_context = get_forward_context()
+                if forward_context is not None:
+                    forward_context.moe_layer_index = 0
+
             draft_token_ids = self._runnable(
                 num_input_tokens=num_input_tokens,
                 batch_size=batch_size,
@@ -660,6 +672,12 @@ def _run_merged_draft(self,
         forward_context.num_accept_tokens = batch_size

         for draft_step in range(self.num_speculative_tokens - 1):
+            if not vllm_version_is("v0.15.0"):
+                # Reset MOE layer index for each draft step iteration
+                forward_context = get_forward_context()
+                if forward_context is not None:
+                    forward_context.moe_layer_index = 0
+
             # Update the inputs.
             # cast to int32 is crucial when eagle model is compiled.
             # tensor.argmax() returns int64 by default.
diff --git a/vllm_ascend/spec_decode/mtp_proposer.py b/vllm_ascend/spec_decode/mtp_proposer.py
index 5a4326ab27a..03f21fed70f 100644
--- a/vllm_ascend/spec_decode/mtp_proposer.py
+++ b/vllm_ascend/spec_decode/mtp_proposer.py
@@ -18,7 +18,7 @@
 from vllm_ascend.compilation.acl_graph import ACLGraphWrapper
 from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
 from vllm_ascend.spec_decode.eagle_proposer import EagleProposer
-from vllm_ascend.utils import lmhead_tp_enable
+from vllm_ascend.utils import lmhead_tp_enable, vllm_version_is


 class MtpProposer(EagleProposer):
@@ -122,6 +122,11 @@ def dummy_run(self,
                 batch_descriptor=batch_descriptor,
                 is_draft_model=True,
                 in_profile_run=is_profile):
+            if not vllm_version_is("v0.15.0"):
+                # Reset MOE layer index for each MTP step iteration
+                forward_context = get_forward_context()
+                if forward_context is not None:
+                    forward_context.moe_layer_index = 0
             previous_hidden_states, positions = self.maybe_pad_and_reduce(
                 previous_hidden_states, positions)
             self.model(input_ids=input_ids,
@@ -330,6 +335,13 @@ def _propose(
                 batch_descriptor=batch_descriptor,
                 num_actual_tokens=num_tokens,
                 is_draft_model=True):
+
+            if not vllm_version_is("v0.15.0"):
+                # Reset MOE layer index for each MTP step to match all_moe_layers registration
+                forward_context = get_forward_context()
+                if forward_context is not None:
+                    forward_context.moe_layer_index = 0
+
             with record_function_or_nullcontext('mtp_forward'):
                 model_kwargs = {}
                 model_kwargs["attn_metadata"] = attn_metadata
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 7ecebb60ae8..1682288c8c2 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -30,7 +30,6 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
-from vllm.attention.layer import Attention, MLAAttention
 from vllm.compilation.cuda_graph import CUDAGraphStat
 from vllm.config import CompilationMode, CUDAGraphMode, VllmConfig, get_layers_from_vllm_config
 from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather
@@ -137,6 +136,12 @@
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")

+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("v0.15.0"):
+    from vllm.attention.layer import Attention, MLAAttention  # type: ignore
+else:
+    from vllm.model_executor.layers.attention import Attention, MLAAttention

 # if true, allow tensor initialization and casting with internal format (e.g., NZ)
 torch.npu.config.allow_internal_format = True