diff --git a/.github/workflows/bot_pr_create.yaml b/.github/workflows/bot_pr_create.yaml index 9e3b03c6e10..51e52bed940 100644 --- a/.github/workflows/bot_pr_create.yaml +++ b/.github/workflows/bot_pr_create.yaml @@ -37,7 +37,7 @@ jobs: steps: - name: Get vLLM version run: | - VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007 + VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV" - name: Checkout repository diff --git a/.github/workflows/dockerfiles/Dockerfile.lint b/.github/workflows/dockerfiles/Dockerfile.lint index e62cf389542..c86324ae6b2 100644 --- a/.github/workflows/dockerfiles/Dockerfile.lint +++ b/.github/workflows/dockerfiles/Dockerfile.lint @@ -27,7 +27,7 @@ RUN apt-get update -y && \ ARG VLLM_REPO=https://github.com/vllm-project/vllm.git # For lint purpose, actually we need make a main2main matching. -ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007 +ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd RUN git clone $VLLM_REPO /vllm-workspace/vllm && \ cd /vllm-workspace/vllm && \ git checkout $VLLM_COMMIT diff --git a/.github/workflows/pr_test_full.yaml b/.github/workflows/pr_test_full.yaml index 9319f2933e5..2275bfb043b 100644 --- a/.github/workflows/pr_test_full.yaml +++ b/.github/workflows/pr_test_full.yaml @@ -75,7 +75,7 @@ jobs: name: e2e-full strategy: matrix: - vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0] + vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] needs: [changes] if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }} uses: ./.github/workflows/_e2e_test.yaml diff --git a/.github/workflows/pr_test_light.yaml b/.github/workflows/pr_test_light.yaml index 52841726627..4049953451d 100644 --- a/.github/workflows/pr_test_light.yaml +++ b/.github/workflows/pr_test_light.yaml @@ -41,7 +41,7 @@ jobs: lint: uses: ./.github/workflows/_pre_commit.yml with: - vllm: 9562912cead1f11e8540fb91306c5cbda66f0007 + vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd changes: runs-on: linux-aarch64-a2b3-0 outputs: @@ -87,7 +87,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} strategy: matrix: - vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0] + vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} @@ -99,7 +99,7 @@ jobs: name: e2e-light strategy: matrix: - vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0] + vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0] # Note (yikun): If CI resource are limited we can split job into two chain jobs needs: [lint, changes] # only trigger e2e test after lint passed and the change is e2e related with pull request. diff --git a/.github/workflows/schedule_codecov_refresh.yaml b/.github/workflows/schedule_codecov_refresh.yaml index ba3fe67fa27..ea7f97eac0b 100644 --- a/.github/workflows/schedule_codecov_refresh.yaml +++ b/.github/workflows/schedule_codecov_refresh.yaml @@ -33,7 +33,7 @@ jobs: name: refresh codecov strategy: matrix: - vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007] + vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd] uses: ./.github/workflows/_unit_test.yaml with: vllm: ${{ matrix.vllm_version }} diff --git a/docs/source/community/versioning_policy.md b/docs/source/community/versioning_policy.md index 55aa06a6d6c..a5b8e39bdd0 100644 --- a/docs/source/community/versioning_policy.md +++ b/docs/source/community/versioning_policy.md @@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | |-------------|--------------|------------------|-------------|--------------------| -| main | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | +| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 | ## Release cadence diff --git a/requirements.txt b/requirements.txt index 82cca9eada0..ef617fd45fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ setuptools>=64 setuptools-scm>=8 torch==2.9.0 torchvision +torchaudio wheel xgrammar>=0.1.30 pandas-stubs diff --git a/tests/ut/eplb/core/test_eplb_utils.py b/tests/ut/eplb/core/test_eplb_utils.py index e2a3cc85fb7..df49283bff9 100644 --- a/tests/ut/eplb/core/test_eplb_utils.py +++ b/tests/ut/eplb/core/test_eplb_utils.py @@ -25,22 +25,35 @@ def setUp(self, mock_fix_incompatible_config): if vllm_version_is("0.15.0"): moe_parallel_config = FusedMoEParallelConfig( 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True) + moe_config = FusedMoEConfig( + num_experts=8, + experts_per_token=8, + hidden_dim=8192, + intermediate_size_per_partition=5, + num_local_experts=8, + activation="silu", + device="npu", + routing_method=RoutingMethodType.Simulated, + moe_parallel_config=moe_parallel_config, + in_dtype=torch.float16, + ) else: moe_parallel_config = FusedMoEParallelConfig( - 2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", - is_sequence_parallel=False, enable_eplb=True) - moe_config = FusedMoEConfig( - num_experts=8, - experts_per_token=8, - hidden_dim=8192, - intermediate_size_per_partition=5, - num_local_experts=8, - activation="silu", - device="npu", - routing_method=RoutingMethodType.Simulated, - moe_parallel_config=moe_parallel_config, - in_dtype=torch.float16, - ) + 2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl", + enable_eplb=True) + moe_config = FusedMoEConfig( + num_experts=8, + experts_per_token=8, + hidden_dim=8192, + intermediate_size_per_partition=5, + num_local_experts=8, + num_logical_experts=8, + activation="silu", + device="npu", + routing_method=RoutingMethodType.Simulated, + moe_parallel_config=moe_parallel_config, + in_dtype=torch.float16, + ) moe_config.supports_eplb = True self.vllm_config = vllm_config self.moe_config = moe_config diff --git a/vllm_ascend/_310p/model_runner_310p.py b/vllm_ascend/_310p/model_runner_310p.py index 9cdbdae5da4..f0cde39c10a 100644 --- a/vllm_ascend/_310p/model_runner_310p.py +++ b/vllm_ascend/_310p/model_runner_310p.py @@ -236,22 +236,22 @@ def _prepare_input_ids( prev_draft_token_indices.extend(range(start, start + draft_len)) indices_match &= prev_index == flattened_index max_flattened_index = max(max_flattened_index, flattened_index) - num_commmon_tokens = len(sample_flattened_indices) + num_common_tokens = len(sample_flattened_indices) total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens - if num_commmon_tokens < total_without_spec: + if num_common_tokens < total_without_spec: self.input_ids.copy_to_gpu(total_num_scheduled_tokens) if self.enable_prompt_embeds: self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens) self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens) - if num_commmon_tokens == 0: + if num_common_tokens == 0: return - if indices_match and max_flattened_index == (num_commmon_tokens - 1): + if indices_match and max_flattened_index == (num_common_tokens - 1): # NOTE: Override the copy_ function here - indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device) - source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0] + indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device) + source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0] self.input_ids.gpu.index_copy_(0, indices, source) if self.enable_prompt_embeds: - self.is_token_ids.gpu[:num_commmon_tokens] = True + self.is_token_ids.gpu[:num_common_tokens] = True return # Upload the index tensors asynchronously so the scatter can be non-blocking. sampled_tokens_index_tensor = torch.tensor( diff --git a/vllm_ascend/ops/fused_moe/fused_moe.py b/vllm_ascend/ops/fused_moe/fused_moe.py index b1853b4a0c3..e2300a07035 100644 --- a/vllm_ascend/ops/fused_moe/fused_moe.py +++ b/vllm_ascend/ops/fused_moe/fused_moe.py @@ -28,6 +28,13 @@ from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE +from vllm_ascend.utils import vllm_version_is + +if not vllm_version_is("0.15.0"): + from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore + from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore + from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore + from vllm_ascend.ascend_config import get_ascend_config from vllm_ascend.ascend_forward_context import MoECommType from vllm_ascend.distributed.parallel_state import get_mc2_group @@ -154,6 +161,77 @@ def apply( return final_hidden_states +if not vllm_version_is("0.15.0"): + # Please remove this inheritance after extending vllm, todo(wxs) + class AscendMoERunner(DefaultMoERunner): + """ + Default implementation of the MoE runner for executing Mixture of Experts layers. + + This class provides a comprehensive implementation for running MoE computations + with support for: + - Expert routing and token dispatching + - Shared experts computation with optional parallel execution using CUDA streams + - Data parallel (DP) chunking for large batch processing + - Tensor model parallel and expert parallel operations + - Various quantization methods and custom operators + - Both monolithic and decomposed expert execution paths + + The runner handles the complete MoE forward pass including routing tokens to + experts, executing expert computations, and combining results. It supports + advanced features like overlapped execution of shared experts and optimized + kernels for different parallel execution modes. + + Eventually, this class will be split up and specialized for different + configurations, e.g. the presence or absence of shared experts, a gate, etc. + """ + + def __init__( + self, + layer: torch.nn.Module, + moe_config: FusedMoEConfig, + router: FusedMoERouter, + routed_input_transform: torch.nn.Module | None, + gate: torch.nn.Module | None, + shared_experts: torch.nn.Module | None, + quant_method: FusedMoEMethodBase, + reduce_results: bool, + enable_dbo: bool, + ): + super().__init__( + layer, + moe_config, + router, + routed_input_transform, + gate, + shared_experts, + quant_method, + reduce_results, + enable_dbo, + ) + if self.shared_experts is None: + self.moe_forward = torch.ops.vllm.moe_forward + else: + self.moe_forward = torch.ops.vllm.moe_forward_shared + + def forward_impl( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_input: torch.Tensor | None, + ): + """ + Override the default forward_impl to use Ascend-specific implementation. + This delegates to the layer's forward_impl method which contains the + Ascend-specific MoE computation logic. + """ + result = layer.forward_impl(hidden_states, router_logits) + # If the layer has shared experts, forward_impl returns a tuple (shared_out, routed_out) + # Otherwise, it returns just routed_out + # The torch op expects the same return type based on whether it's moe_forward or moe_forward_shared + return result + + class AscendFusedMoE(FusedMoE): moe_counter = -1 gate_stream: torch.npu.Stream | None = None @@ -237,6 +315,26 @@ def __init__(self, *args, **kwargs): setup_moe_comm_method(self.moe_config) self.quant_type = self._get_quant_type() + if not vllm_version_is("0.15.0"): + self.runner = self._init_runner() + + if not vllm_version_is("0.15.0"): + + def _init_runner(self): + # Storing the runner in the FusedMoE is an intermediate state, eventually + # the runner will own the FusedMoE layer and provide the execution interface + # for MoE ops. + return AscendMoERunner( + layer=self, + moe_config=self.moe_config, + router=self.router, + routed_input_transform=self._routed_input_transform, + gate=self.gate, + shared_experts=self.shared_experts, + quant_method=self.quant_method, + reduce_results=self.reduce_results, + enable_dbo=self.vllm_config.parallel_config.enable_dbo, + ) def _get_quant_type(self) -> QuantType: quant_type = QuantType.NONE @@ -266,6 +364,19 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens """ return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states) + if not vllm_version_is("0.15.0"): + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + self.ensure_moe_quant_config_init() + return self.runner.forward( + hidden_states, + router_logits, + ) + def forward_impl( # type: ignore[override] self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False ) -> torch.Tensor | FusedMoEResult: @@ -414,6 +525,10 @@ def __init__( logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.") self._gate = gate + if not vllm_version_is("0.15.0"): + # Recreate the runner with the correct shared_experts parameter + # The parent class created the runner before self._shared_experts was set + self.runner = self._init_runner() if self.multistream_overlap_shared_expert: # Wrap the quant_method's process_weights_after_loading to validate that diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py index 41f1a169499..b7fced6e925 100644 --- a/vllm_ascend/utils.py +++ b/vllm_ascend/utils.py @@ -524,6 +524,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: "increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV." ) + from vllm_ascend.utils import vllm_version_is + + if vllm_version_is("0.15.0"): + arch_name = vllm_config.model_config.architectures[0] + else: + arch_name = vllm_config.model_config.architecture + # If original sizes exceed maximum, sample a representative subset if max_num_batch_sizes < len(original_sizes): # Sample uniformly from original sizes @@ -535,10 +542,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: sampled_sizes = [original_sizes[i] for i in indices] update_cudagraph_capture_sizes(vllm_config, sampled_sizes) - logger.info( "Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes", - vllm_config.model_config.architectures[0], + arch_name, num_hidden_layers, len(original_sizes), len( @@ -550,7 +556,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None: compilation_config.cudagraph_capture_sizes = original_sizes logger.info( "No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes", - vllm_config.model_config.architectures[0], + arch_name, num_hidden_layers, len(original_sizes), )