diff --git a/.github/workflows/_e2e_test.yaml b/.github/workflows/_e2e_test.yaml index 1fe70ef3538..0ccb950cee1 100644 --- a/.github/workflows/_e2e_test.yaml +++ b/.github/workflows/_e2e_test.yaml @@ -36,7 +36,7 @@ on: continue_on_error: required: false type: boolean - default: false + default: true # The following inputs are used by comment-triggered E2E tests (/e2e ). # They carry space-separated pytest paths, categorized by runner type. # Leave empty (default) when running label-triggered full/light suites. diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index 24fbaab9497..f7d3f3e13f3 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=6f786f2c506cb07f4566771fdc62e640e2c4a176 +ARG VLLM_COMMIT=ccaf5ffaa3e1fb2a081b2c9e403ac0e4dfc142c8 RUN git init /vllm-workspace/vllm && \ git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \ git -C /vllm-workspace/vllm checkout FETCH_HEAD diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 8e6b51fd7bf..b9d697050fc 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -80,7 +80,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0] + vllm_version: [ccaf5ffaa3e1fb2a081b2c9e403ac0e4dfc142c8, v0.19.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 767c3adcc7e..51d77391794 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 6f786f2c506cb07f4566771fdc62e640e2c4a176 + vllm: ccaf5ffaa3e1fb2a081b2c9e403ac0e4dfc142c8 changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -92,7 +92,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0] + vllm_version: [ccaf5ffaa3e1fb2a081b2c9e403ac0e4dfc142c8, v0.19.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -104,7 +104,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [6f786f2c506cb07f4566771fdc62e640e2c4a176, v0.19.0] + vllm_version: [ccaf5ffaa3e1fb2a081b2c9e403ac0e4dfc142c8, v0.19.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/docs/source/conf.py b/docs/source/conf.py index f01562dc68d..84b69fa2947 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -80,7 +80,7 @@ # CANN image tag "cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11", # vLLM commit hash for main branch - "main_vllm_commit": "6f786f2c506cb07f4566771fdc62e640e2c4a176", + "main_vllm_commit": "ccaf5ffaa3e1fb2a081b2c9e403ac0e4dfc142c8", # vLLM tag for main branch "main_vllm_tag": "v0.19.0", # Python version for main branch diff --git a/requirements.txt b/requirements.txt index 28ce4b7b03b..802420d386a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,6 +35,6 @@ numba torch-npu==2.9.0 arctic-inference==0.1.1 -transformers>=4.57.4 +transformers>=4.57.4, <5.0 fastapi<0.124.0 triton-ascend==3.2.0 diff --git a/tests/ut/ops/test_mla.py b/tests/ut/ops/test_mla.py index 870daed448e..1cffe1e4f2d 100644 --- a/tests/ut/ops/test_mla.py +++ b/tests/ut/ops/test_mla.py @@ -8,6 +8,7 @@ from tests.ut.base import TestBase from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention, IndexerWrapper +from vllm_ascend.utils import vllm_version_is class TestIndexerWrapper(TestBase): @@ -19,8 +20,11 @@ def test_initialization(self): mock_indexer.topk_tokens = 2048 mock_indexer.q_lora_rank = 1536 mock_indexer.wq_b = nn.Linear(128, 128) - mock_indexer.wk = nn.Linear(128, 128) - mock_indexer.weights_proj = nn.Linear(128, 128) + if vllm_version_is("0.19.0"): + mock_indexer.wk = nn.Linear(128, 128) + mock_indexer.weights_proj = nn.Linear(128, 128) + else: + mock_indexer.wk_weights_proj = nn.Linear(128, 128) mock_indexer.k_norm = nn.LayerNorm(128) mock_indexer.softmax_scale = 0.123 mock_indexer.topk_indices_buffer = torch.randn(10) @@ -33,8 +37,11 @@ def test_initialization(self): self.assertEqual(wrapper.topk_tokens, 2048) self.assertEqual(wrapper.q_lora_rank, 1536) self.assertIs(wrapper.wq_b, mock_indexer.wq_b) - self.assertIs(wrapper.wk, mock_indexer.wk) - self.assertIs(wrapper.weights_proj, mock_indexer.weights_proj) + if vllm_version_is("0.19.0"): + self.assertIs(wrapper.wk, mock_indexer.wk) + self.assertIs(wrapper.weights_proj, mock_indexer.weights_proj) + else: + self.assertIs(wrapper.wk_weights_proj, mock_indexer.wk_weights_proj) self.assertIs(wrapper.k_norm, mock_indexer.k_norm) self.assertEqual(wrapper.softmax_scale, 0.123) diff --git a/vllm_ascend/_310p/fused_moe/fused_moe.py b/vllm_ascend/_310p/fused_moe/fused_moe.py index 9fa1dc187ed..2f7aa0646a8 100644 --- a/vllm_ascend/_310p/fused_moe/fused_moe.py +++ b/vllm_ascend/_310p/fused_moe/fused_moe.py @@ -20,7 +20,6 @@ from vllm.distributed import get_dp_group, get_ep_group, get_tp_group from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute @@ -29,6 +28,9 @@ from vllm_ascend.quantization.quant_type import QuantType from vllm_ascend.utils import vllm_version_is +if vllm_version_is("0.19.0"): + from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE # type: ignore[no-redef] + from .experts_selector import select_experts from .moe_comm_method import AllGatherCommImpl310 @@ -118,7 +120,12 @@ def apply( class AscendFusedMoE310(FusedMoE): def __init__(self, *args, **kwargs): + is_legacy = vllm_version_is("0.19.0") + if not is_legacy: + _routed_input_transform = kwargs.get("routed_input_transform") super().__init__(*args, **kwargs) + if not is_legacy: + self.reduce_results = False self.global_num_experts = kwargs["num_experts"] @@ -164,18 +171,29 @@ def __init__(self, *args, **kwargs): from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner - is_legacy = vllm_version_is("0.19.0") - self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self.gate if is_legacy else kwargs.pop("gate", None), - self.shared_experts if is_legacy else kwargs.pop("shared_experts", None), - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) + if is_legacy: + self.runner = AscendMoERunner( + self, + self.moe_config, + self.router, + self._routed_input_transform, + self.gate, + self.shared_experts, + self.quant_method, + self.reduce_results, + self.vllm_config.parallel_config.enable_dbo, + ) + else: + self.runner = AscendMoERunner( + self.layer_name, + self.moe_config, + self.router, + _routed_input_transform, + kwargs.get("gate"), + kwargs.get("shared_experts"), + self.quant_method, + self.vllm_config.parallel_config.enable_dbo, + ) def init_experts_map(self, moe_config): """ @@ -221,6 +239,9 @@ def get_quant_type(self) -> QuantType: raise RuntimeError("Only Unquant and W8A8 is supported.") return quant_type + def maybe_init_modular_kernel(self) -> None: + return None + def forward_impl( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor ) -> torch.Tensor: @@ -263,7 +284,10 @@ def forward_impl( # type: ignore[override] return routed_out -class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310): +_SharedFusedMoEBase310 = (SharedFusedMoE, AscendFusedMoE310) if vllm_version_is("0.19.0") else (AscendFusedMoE310,) + + +class AscendSharedFusedMoE310(*_SharedFusedMoEBase310): # type: ignore[misc] def __init__( self, shared_experts: torch.nn.Module, @@ -286,17 +310,29 @@ def __init__( from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner is_legacy = vllm_version_is("0.19.0") - self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self._gate, - self._shared_experts, - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) + if is_legacy: + self.runner = AscendMoERunner( + self, + self.moe_config, + self.router, + self._routed_input_transform, + self._gate, + self._shared_experts, + self.quant_method, + self.reduce_results, + self.vllm_config.parallel_config.enable_dbo, + ) + else: + self.runner = AscendMoERunner( + self.layer_name, + self.moe_config, + self.router, + self._routed_input_transform, + self._gate, + self._shared_experts, + self.quant_method, + self.vllm_config.parallel_config.enable_dbo, + ) @property def is_internal_router(self) -> bool: diff --git a/vllm_ascend/ascend_forward_context.py b/vllm_ascend/ascend_forward_context.py index 35f006b3822..39c9013d5ba 100644 --- a/vllm_ascend/ascend_forward_context.py +++ b/vllm_ascend/ascend_forward_context.py @@ -20,6 +20,7 @@ is_drafter_moe_model, is_moe_model, speculative_enable_dispatch_gmm_combine_decode, + vllm_version_is, ) @@ -153,7 +154,11 @@ def set_ascend_forward_context( dp_world_size = get_dp_group().world_size if dp_world_size > 1 and forward_context.dp_metadata is not None: - max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item() + dp_meta = forward_context.dp_metadata + if vllm_version_is("0.19.0"): + max_tokens_across_dp = dp_meta.max_tokens_across_dp_cpu.item() + else: + max_tokens_across_dp = dp_meta.num_tokens_across_dp_cpu.max().item() if forward_context.flash_comm_v1_enabled or forward_context.flashcomm_v2_enabled: padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size pad_size = padded_length - num_tokens diff --git a/vllm_ascend/attention/context_parallel/sfa_cp.py b/vllm_ascend/attention/context_parallel/sfa_cp.py index 0214921a80a..5aca45ba039 100644 --- a/vllm_ascend/attention/context_parallel/sfa_cp.py +++ b/vllm_ascend/attention/context_parallel/sfa_cp.py @@ -12,6 +12,7 @@ from vllm_ascend.attention.sfa_v1 import AscendSFAImpl, AscendSFAMetadata, AscendSFAMetadataBuilder from vllm_ascend.attention.utils import AscendCommonAttentionMetadata, enabling_mlapo, split_decodes_and_prefills from vllm_ascend.ops.triton.rope import rope_forward_triton_siso +from vllm_ascend.utils import vllm_version_is M = TypeVar("M", bound=AscendSFAMetadata) @@ -385,7 +386,11 @@ def indexer_select_post_process( actual_seq_lengths_query: torch.Tensor, actual_seq_lengths_key: torch.Tensor, ): - weights, _ = self.weights_proj(x) + if vllm_version_is("0.19.0"): + weights, _ = self.weights_proj(x) + else: + kw, _ = self.wk_weights_proj(x) + weights = kw[:, self.head_dim :] q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128] q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128] diff --git a/vllm_ascend/attention/sfa_v1.py b/vllm_ascend/attention/sfa_v1.py index 4c03a60de68..ebef2984a3f 100644 --- a/vllm_ascend/attention/sfa_v1.py +++ b/vllm_ascend/attention/sfa_v1.py @@ -55,6 +55,7 @@ enable_dsa_cp_with_o_proj_tp, get_weight_prefetch_method, maybe_trans_nz, + vllm_version_is, ) from vllm_ascend.worker.npu_input_batch import NPUInputBatch @@ -438,8 +439,12 @@ def __init__( self.n_head: int = self.indexer.n_head # 64 self.head_dim: int = self.indexer.head_dim # 128 self.wq_b = self.indexer.wq_b - self.wk = self.indexer.wk - self.weights_proj = self.indexer.weights_proj + # upstream ac3dac545 fused wk+weights_proj into wk_weights_proj + if vllm_version_is("0.19.0"): + self.wk = self.indexer.wk + self.weights_proj = self.indexer.weights_proj + else: + self.wk_weights_proj = self.indexer.wk_weights_proj self.k_norm = self.indexer.k_norm self.cp_size = 1 self.is_rope_neox_style = True @@ -908,7 +913,11 @@ def indexer_select_pre_process( cos: torch.Tensor, sin: torch.Tensor, ): - k_li, _ = self.wk(x) # [b,s,7168] @ [7168,128] = [b,s,128] + if vllm_version_is("0.19.0"): + k_li, _ = self.wk(x) # [b,s,7168] @ [7168,128] = [b,s,128] + else: + kw, _ = self.wk_weights_proj(x) + k_li = kw[:, : self.head_dim] k_li = self.k_norm(k_li).unsqueeze(1) k_li = k_li.view(-1, 1, self.head_dim) @@ -953,7 +962,11 @@ def indexer_select_post_process( actual_seq_lengths_query: torch.Tensor, actual_seq_lengths_key: torch.Tensor, ): - weights, _ = self.weights_proj(x) + if vllm_version_is("0.19.0"): + weights, _ = self.weights_proj(x) + else: + kw, _ = self.wk_weights_proj(x) + weights = kw[:, self.head_dim :] q_li, _ = self.wq_b(q_c) # [b,s,1536] @ [1536,64*128] = [b,s,64*128] q_li = q_li.view(-1, self.n_head, self.head_dim) # [n_toks,64,128] diff --git a/vllm_ascend/lora/utils.py b/vllm_ascend/lora/utils.py index d822b362ee4..fac71315e5e 100755 --- a/vllm_ascend/lora/utils.py +++ b/vllm_ascend/lora/utils.py @@ -26,6 +26,7 @@ AscendRowParallelLinear, ) from vllm_ascend.ops.vocab_parallel_embedding import AscendVocabParallelEmbedding +from vllm_ascend.utils import vllm_version_is class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): @@ -184,16 +185,27 @@ def can_replace_layer( return type(source_layer) is AscendRowParallelLinear +_ASCEND_LORA_CLASSES = ( + AscendColumnParallelLinearWithLoRA, + AscendMergedColumnParallelLinearWithLoRA, + AscendRowParallelLinearWithLoRA, + AscendVocabParallelEmbeddingWithLoRA, + AscendQKVParallelLinearWithLoRA, + AscendMergedQKVParallelLinearWithLoRA, + AscendColumnParallelLinearWithShardedLoRA, + AscendMergedColumnParallelLinearWithShardedLoRA, + AscendMergedQKVParallelLinearWithShardedLoRA, + AscendQKVParallelLinearWithShardedLoRA, + AscendRowParallelLinearWithShardedLoRA, + AscendReplicatedLinearWithLoRA, +) + + def refresh_all_lora_classes(): - vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithLoRA) - vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA) - vllm.lora.utils._all_lora_classes.add(AscendReplicatedLinearWithLoRA) + if vllm_version_is("0.19.0"): + vllm.lora.utils._all_lora_classes.update(_ASCEND_LORA_CLASSES) + return + + vllm.lora.utils._all_lora_classes = tuple( + dict.fromkeys((*_ASCEND_LORA_CLASSES, *vllm.lora.utils._all_lora_classes)) + ) diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index 53651ddf912..fd17076b771 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -29,8 +29,6 @@ from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer -from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore -from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE import vllm_ascend.envs as envs_ascend from vllm_ascend.ascend_config import get_ascend_config @@ -52,6 +50,19 @@ vllm_version_is, ) +if vllm_version_is("0.19.0"): + from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( + DefaultMoERunner as _MoERunnerBase, # type: ignore[no-redef] + ) + from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE # type: ignore[no-redef] +else: + from vllm.model_executor.layers.fused_moe.runner.moe_runner import ( + MoERunner as _MoERunnerBase, # type: ignore[no-redef] + ) + from vllm.model_executor.layers.fused_moe.runner.shared_experts import ( # type: ignore[no-redef] + SharedExpertsOrder, + ) + @dataclass class FusedMoEResult: @@ -221,13 +232,50 @@ def apply( # Please remove this inheritance after extending vllm, todo(wxs) -class AscendMoERunner(DefaultMoERunner): +class AscendMoERunner(_MoERunnerBase): @property def use_dp_chunking(self) -> bool: """Ascend uses its own forward_impl path, not the FlashInfer Cutlass chunked path. Always return False to stay on forward_impl.""" return False + def _forward_impl( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if not vllm_version_is("0.19.0"): + self._maybe_sync_shared_experts_stream(shared_experts_input) + if self.gate is not None: + router_logits, _ = self.gate(hidden_states) + + with self._sequence_parallel_context(): + if not vllm_version_is("0.19.0"): + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.NO_OVERLAP, + ) + + routed_out = self.forward_impl( + layer, + hidden_states, + router_logits, + shared_experts_input, + ) + if vllm_version_is("0.19.0"): + return routed_out + + self._maybe_apply_shared_experts( + shared_experts_input, + SharedExpertsOrder.MULTI_STREAM_OVERLAPPED, + ) + if self.shared_experts is not None: + return self.shared_experts.output, routed_out + + return routed_out + # TODO: Remove this after drop v0.19.0 support def forward_impl( self, @@ -268,7 +316,16 @@ class AscendFusedMoE(FusedMoE): gate_stream: torch.npu.Stream | None = None def __init__(self, *args, **kwargs): + is_legacy = vllm_version_is("0.19.0") + if not is_legacy: + _routed_input_transform = kwargs.get("routed_input_transform") + _routed_output_transform = kwargs.get("routed_output_transform") + _runner_routed_scaling_factor = ( + kwargs.get("routed_scaling_factor", 1.0) if kwargs.get("apply_routed_scale_to_output", False) else 1.0 + ) super().__init__(*args, **kwargs) + if not is_legacy: + self.reduce_results = False num_experts = kwargs["num_experts"] intermediate_size = kwargs["intermediate_size"] @@ -358,17 +415,31 @@ def __init__(self, *args, **kwargs): self.quant_type = self._get_quant_type() is_legacy = vllm_version_is("0.19.0") - self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self.gate if is_legacy else kwargs.pop("gate", None), - self.shared_experts if is_legacy else kwargs.pop("shared_experts", None), - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) + if is_legacy: + self.runner = AscendMoERunner( + self, + self.moe_config, + self.router, + self._routed_input_transform, + self.gate, + self.shared_experts, + self.quant_method, + self.reduce_results, + self.vllm_config.parallel_config.enable_dbo, + ) + else: + self.runner = AscendMoERunner( + self.layer_name, + self.moe_config, + self.router, + _routed_input_transform, + kwargs.pop("gate", None), + kwargs.pop("shared_experts", None), + self.quant_method, + self.vllm_config.parallel_config.enable_dbo, + routed_output_transform=_routed_output_transform, + routed_scaling_factor=_runner_routed_scaling_factor, + ) def _get_quant_type(self) -> QuantType: quant_type = QuantType.NONE @@ -400,6 +471,9 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens """ return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) + def maybe_init_modular_kernel(self) -> None: + return None + def forward( self, hidden_states: torch.Tensor, @@ -546,7 +620,10 @@ def forward_impl( # type: ignore[override] return routed_out -class AscendSharedFusedMoE(SharedFusedMoE, AscendFusedMoE): +_SharedFusedMoEBase = (SharedFusedMoE, AscendFusedMoE) if vllm_version_is("0.19.0") else (AscendFusedMoE,) + + +class AscendSharedFusedMoE(*_SharedFusedMoEBase): # type: ignore[misc] def __init__( self, shared_experts: torch.nn.Module, @@ -584,17 +661,34 @@ def __init__( # FusedMoE.shared_experts is a property that reads self.runner.shared_experts, # which at this point is still the stale runner built with shared_experts=None. is_legacy = vllm_version_is("0.19.0") - self.runner = AscendMoERunner( - self if is_legacy else self.layer_name, - self.moe_config, - self.router, - self._routed_input_transform, - self.gate, - self._shared_experts, - self.quant_method, - self.reduce_results, - self.vllm_config.parallel_config.enable_dbo, - ) + if is_legacy: + self.runner = AscendMoERunner( + self, + self.moe_config, + self.router, + self._routed_input_transform, + self.gate, + self._shared_experts, + self.quant_method, + self.reduce_results, + self.vllm_config.parallel_config.enable_dbo, + ) + else: + runner_routed_scaling_factor = ( + kwargs.get("routed_scaling_factor", 1.0) if kwargs.get("apply_routed_scale_to_output", False) else 1.0 + ) + self.runner = AscendMoERunner( + self.layer_name, + self.moe_config, + self.router, + self._routed_input_transform, + self.gate, + self._shared_experts, + self.quant_method, + self.vllm_config.parallel_config.enable_dbo, + routed_output_transform=kwargs.get("routed_output_transform"), + routed_scaling_factor=runner_routed_scaling_factor, + ) if self.multistream_overlap_shared_expert: # Wrap the quant_method's process_weights_after_loading to validate that diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py index 2c3a8b0fad2..ec0ed8236e5 100644 --- a/vllm_ascend/ops/layernorm.py +++ b/vllm_ascend/ops/layernorm.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm, RMSNormGated from vllm_ascend.ops.triton.layernorm_gated import layer_norm_fwd_npu -from vllm_ascend.utils import enable_custom_op, get_weight_prefetch_method +from vllm_ascend.utils import enable_custom_op, get_weight_prefetch_method, vllm_version_is class AscendRMSNorm(RMSNorm): @@ -112,7 +112,18 @@ def forward_oot( class LayerNormFn(torch.autograd.Function): @staticmethod - def forward(ctx, x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, is_rms_norm=False): + def forward( + ctx, + x, + weight, + bias, + z=None, + eps=1e-6, + group_size=None, + norm_before_gate=True, + is_rms_norm=False, + activation: str = "swish", + ): """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))""" x_shape_og = x.shape @@ -156,13 +167,18 @@ def __init__( norm_before_gate: bool = False, device: torch.device | None = None, dtype: torch.dtype | None = None, + activation: str = "swish", ): """If group_size is not None, we do GroupNorm with each group having group_size elements. group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). """ factory_kwargs = {"device": device, "dtype": dtype} - super().__init__(hidden_size, eps, group_size, norm_before_gate, device, dtype) + if vllm_version_is("0.19.0"): + super().__init__(hidden_size, eps, group_size, norm_before_gate, device, dtype) + else: + super().__init__(hidden_size, eps, group_size, norm_before_gate, device, dtype, activation) self.eps = eps + self.activation = activation self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) self.register_parameter("bias", None) self.group_size = group_size @@ -174,4 +190,17 @@ def reset_parameters(self): def forward_oot(self, x, z=None): """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z))""" - return LayerNormFn.apply(x, self.weight, self.bias, z, self.eps, self.group_size, self.norm_before_gate, True) + # assert self.activation in ["silu", "sigmoid", "swish"] + # if not vllm_version_is("0.19.0") and self.activation == "sigmoid": + # return super().forward_native(x, z) + return LayerNormFn.apply( + x, + self.weight, + self.bias, + z, + self.eps, + self.group_size, + self.norm_before_gate, + True, + self.activation, + ) diff --git a/vllm_ascend/ops/mla.py b/vllm_ascend/ops/mla.py index 689ed0cd672..4c318493120 100644 --- a/vllm_ascend/ops/mla.py +++ b/vllm_ascend/ops/mla.py @@ -33,7 +33,7 @@ from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import _EXTRA_CTX -from vllm_ascend.utils import is_vl_model, parse_layer_idx +from vllm_ascend.utils import is_vl_model, parse_layer_idx, vllm_version_is class IndexerWrapper(nn.Module): @@ -41,7 +41,7 @@ class IndexerWrapper(nn.Module): A wrapper of Indexer for Deepseek v3.2. This wrapper is currently used to solve the fp8 hard code issue of vllm's deepseek_v2.py. It wraps the original Indexer, inherits its module weights - (including wq_b, wk, weights_proj, k_norm) + (including wq_b, wk_weights_proj or wk/weights_proj, k_norm) while deletes the unused topk_indices_buffer and k_cache to save memory. TODO: Will be removed once original Indexer supports different quantization methods. """ @@ -54,8 +54,12 @@ def __init__(self, vllm_indexer: nn.Module) -> None: self.topk_tokens: int = vllm_indexer.topk_tokens # 2048 self.q_lora_rank: int = vllm_indexer.q_lora_rank # 1536 self.wq_b = vllm_indexer.wq_b - self.wk = vllm_indexer.wk - self.weights_proj = vllm_indexer.weights_proj + # upstream ac3dac545 fused wk+weights_proj into wk_weights_proj + if vllm_version_is("0.19.0"): + self.wk = vllm_indexer.wk + self.weights_proj = vllm_indexer.weights_proj + else: + self.wk_weights_proj = vllm_indexer.wk_weights_proj self.k_norm = vllm_indexer.k_norm self.softmax_scale = vllm_indexer.softmax_scale vllm_indexer.topk_indices_buffer = None # delete topk_indices_buffer diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py index d572c69c6ca..730026ad923 100644 --- a/vllm_ascend/platform.py +++ b/vllm_ascend/platform.py @@ -48,6 +48,7 @@ update_cudagraph_capture_sizes, is_310p, enable_sp, + vllm_version_is, ) if TYPE_CHECKING: @@ -739,7 +740,10 @@ def set_additional_forward_context( num_tokens = list(attn_metadata.values())[0].num_actual_tokens dp_world_size = get_dp_group().world_size if dp_world_size > 1 and dp_metadata is not None: - max_tokens_across_dp = dp_metadata.max_tokens_across_dp_cpu.item() + if vllm_version_is("0.19.0"): + max_tokens_across_dp = dp_metadata.max_tokens_across_dp_cpu.item() + else: + max_tokens_across_dp = dp_metadata.num_tokens_across_dp_cpu.max().item() if flash_comm_v1_enabled or flashcomm_v2_enabled: padded_length = (max_tokens_across_dp + tp_world_size - 1) // tp_world_size * tp_world_size pad_size = padded_length - num_tokens diff --git a/vllm_ascend/worker/worker.py b/vllm_ascend/worker/worker.py index 90e7ecf1889..96d7b668869 100644 --- a/vllm_ascend/worker/worker.py +++ b/vllm_ascend/worker/worker.py @@ -59,9 +59,13 @@ enable_sp, get_ascend_device_type, register_ascend_customop, + vllm_version_is, ) from vllm_ascend.worker.model_runner_v1 import NPUModelRunner +if not vllm_version_is("0.19.0"): + from vllm.v1.worker.worker_base import CompilationTimes # noqa: E402 + torch._dynamo.trace_rules.clear_lru_cache() # noqa: E402 from torch._dynamo.variables import TorchInGraphFunctionVariable # noqa: E402 from vllm.utils.torch_utils import set_random_seed # noqa: E402 @@ -433,7 +437,7 @@ def load_model(self) -> None: with context, set_current_vllm_config(self.vllm_config): self.model_runner.load_model() - def compile_or_warm_up_model(self) -> float: + def compile_or_warm_up_model(self): # Note: need to adapt for graph mode. warmup_sizes = (self.vllm_config.compilation_config.compile_sizes or []).copy() if not self.model_config.enforce_eager: @@ -472,7 +476,13 @@ def compile_or_warm_up_model(self) -> float: # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) - return self.vllm_config.compilation_config.compilation_time + if vllm_version_is("0.19.0"): + return self.vllm_config.compilation_config.compilation_time + + return CompilationTimes( + language_model=self.vllm_config.compilation_config.compilation_time, + encoder=self.compilation_config.encoder_compilation_time, + ) def _warm_up_atb(self): x = torch.rand((2, 4), dtype=torch.float16).npu()