Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes, we actually need to make a main2main match.
ARG VLLM_COMMIT=9562912cead1f11e8540fb91306c5cbda66f0007
ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: 9562912cead1f11e8540fb91306c5cbda66f0007
vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
Expand Down Expand Up @@ -87,7 +87,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -99,7 +99,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
# Note (yikun): If CI resources are limited, we can split the job into two chained jobs
needs: [lint, changes]
# Only trigger the e2e test after lint has passed and the pull request's change is e2e related.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [9562912cead1f11e8540fb91306c5cbda66f0007]
vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | 9562912cead1f11e8540fb91306c5cbda66f0007, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ setuptools>=64
setuptools-scm>=8
torch==2.9.0
torchvision
torchaudio
wheel
xgrammar>=0.1.30
pandas-stubs
Expand Down
41 changes: 27 additions & 14 deletions tests/ut/eplb/core/test_eplb_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,35 @@ def setUp(self, mock_fix_incompatible_config):
if vllm_version_is("0.15.0"):
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
else:
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
is_sequence_parallel=False, enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
2, 0, 1, 2, 1, 1, 1, 1, 1, True, "hccl",
enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
hidden_dim=8192,
intermediate_size_per_partition=5,
num_local_experts=8,
num_logical_experts=8,
activation="silu",
device="npu",
routing_method=RoutingMethodType.Simulated,
moe_parallel_config=moe_parallel_config,
in_dtype=torch.float16,
)
moe_config.supports_eplb = True
self.vllm_config = vllm_config
self.moe_config = moe_config
Expand Down
14 changes: 7 additions & 7 deletions vllm_ascend/_310p/model_runner_310p.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,22 +236,22 @@ def _prepare_input_ids(
prev_draft_token_indices.extend(range(start, start + draft_len))
indices_match &= prev_index == flattened_index
max_flattened_index = max(max_flattened_index, flattened_index)
num_commmon_tokens = len(sample_flattened_indices)
num_common_tokens = len(sample_flattened_indices)
total_without_spec = total_num_scheduled_tokens - total_num_spec_tokens
if num_commmon_tokens < total_without_spec:
if num_common_tokens < total_without_spec:
self.input_ids.copy_to_gpu(total_num_scheduled_tokens)
if self.enable_prompt_embeds:
self.inputs_embeds.copy_to_gpu(total_num_scheduled_tokens)
self.is_token_ids.copy_to_gpu(total_num_scheduled_tokens)
if num_commmon_tokens == 0:
if num_common_tokens == 0:
return
if indices_match and max_flattened_index == (num_commmon_tokens - 1):
if indices_match and max_flattened_index == (num_common_tokens - 1):
# NOTE: Override the copy_ function here
indices = torch.arange(num_commmon_tokens, device=self.input_ids.gpu.device)
source = self.input_batch.prev_sampled_token_ids[:num_commmon_tokens, 0]
indices = torch.arange(num_common_tokens, device=self.input_ids.gpu.device)
source = self.input_batch.prev_sampled_token_ids[:num_common_tokens, 0]
self.input_ids.gpu.index_copy_(0, indices, source)
if self.enable_prompt_embeds:
self.is_token_ids.gpu[:num_commmon_tokens] = True
self.is_token_ids.gpu[:num_common_tokens] = True
return
# Upload the index tensors asynchronously so the scatter can be non-blocking.
sampled_tokens_index_tensor = torch.tensor(
Expand Down
115 changes: 115 additions & 0 deletions vllm_ascend/ops/fused_moe/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE

from vllm_ascend.utils import vllm_version_is

if not vllm_version_is("0.15.0"):
from vllm.model_executor.layers.fused_moe.fused_moe_method_base import FusedMoEMethodBase # type: ignore
from vllm.model_executor.layers.fused_moe.router.fused_moe_router import FusedMoERouter # type: ignore
from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import DefaultMoERunner # type: ignore

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.ascend_forward_context import MoECommType
from vllm_ascend.distributed.parallel_state import get_mc2_group
Expand Down Expand Up @@ -154,6 +161,77 @@ def apply(
return final_hidden_states


if not vllm_version_is("0.15.0"):
    # TODO(wxs): remove this inheritance once vLLM itself has been extended.
    class AscendMoERunner(DefaultMoERunner):
        """MoE runner that routes the forward computation back to the Ascend layer.

        This subclass of ``DefaultMoERunner`` changes two things:

        * After the base initializer has run, ``self.moe_forward`` is set to
          ``torch.ops.vllm.moe_forward`` when the runner has no shared
          experts, and to ``torch.ops.vllm.moe_forward_shared`` otherwise.
        * ``forward_impl`` does no computation of its own; it calls the
          ``forward_impl`` method of the layer it is given.
        """

        def __init__(
            self,
            layer: torch.nn.Module,
            moe_config: FusedMoEConfig,
            router: FusedMoERouter,
            routed_input_transform: torch.nn.Module | None,
            gate: torch.nn.Module | None,
            shared_experts: torch.nn.Module | None,
            quant_method: FusedMoEMethodBase,
            reduce_results: bool,
            enable_dbo: bool,
        ):
            """Initialize the base runner, then pick the custom op to dispatch to.

            All arguments are forwarded positionally, unchanged and in the
            same order, to ``DefaultMoERunner.__init__``.
            """
            base_args = (
                layer,
                moe_config,
                router,
                routed_input_transform,
                gate,
                shared_experts,
                quant_method,
                reduce_results,
                enable_dbo,
            )
            super().__init__(*base_args)

            # Choose the op variant according to whether shared experts exist.
            has_shared_experts = self.shared_experts is not None
            self.moe_forward = (
                torch.ops.vllm.moe_forward_shared if has_shared_experts else torch.ops.vllm.moe_forward
            )

        def forward_impl(
            self,
            layer: torch.nn.Module,
            hidden_states: torch.Tensor,
            router_logits: torch.Tensor,
            shared_input: torch.Tensor | None,
        ):
            """Delegate the MoE computation to ``layer.forward_impl``.

            ``shared_input`` is accepted as part of the signature but is not
            used here; only ``hidden_states`` and ``router_logits`` are passed
            to the layer.

            Returns:
                Whatever ``layer.forward_impl`` returns. Per the original
                author's note, that is expected to be a
                ``(shared_out, routed_out)`` tuple when the layer has shared
                experts and just ``routed_out`` otherwise, matching the
                return type of the op selected in ``__init__``
                (NOTE(review): not verifiable from this block; confirm
                against the layer implementation).
            """
            return layer.forward_impl(hidden_states, router_logits)


class AscendFusedMoE(FusedMoE):
moe_counter = -1
gate_stream: torch.npu.Stream | None = None
Expand Down Expand Up @@ -237,6 +315,26 @@ def __init__(self, *args, **kwargs):

setup_moe_comm_method(self.moe_config)
self.quant_type = self._get_quant_type()
if not vllm_version_is("0.15.0"):
self.runner = self._init_runner()

if not vllm_version_is("0.15.0"):

    def _init_runner(self):
        """Create the ``AscendMoERunner`` for this layer from its current attributes.

        Returns:
            A new ``AscendMoERunner`` built from this layer's config, router,
            gate, shared experts, quantization method and related settings.
        """
        # Keeping the runner on the FusedMoE layer is an intermediate state:
        # eventually the runner will own the FusedMoE layer and provide the
        # execution interface for MoE ops.
        runner_kwargs = dict(
            layer=self,
            moe_config=self.moe_config,
            router=self.router,
            routed_input_transform=self._routed_input_transform,
            gate=self.gate,
            shared_experts=self.shared_experts,
            quant_method=self.quant_method,
            reduce_results=self.reduce_results,
            enable_dbo=self.vllm_config.parallel_config.enable_dbo,
        )
        return AscendMoERunner(**runner_kwargs)

def _get_quant_type(self) -> QuantType:
quant_type = QuantType.NONE
Expand Down Expand Up @@ -266,6 +364,19 @@ def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tens
"""
return torch.ops.vllm.maybe_all_reduce_tensor_model_parallel(final_hidden_states)

if not vllm_version_is("0.15.0"):

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run the layer by handing the inputs to its runner.

        ``ensure_moe_quant_config_init()`` is called first, then
        ``hidden_states`` and ``router_logits`` are passed to
        ``self.runner.forward`` and its result is returned unchanged.
        """
        self.ensure_moe_quant_config_init()
        runner = self.runner
        return runner.forward(hidden_states, router_logits)

def forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor, return_with_event: bool = False
) -> torch.Tensor | FusedMoEResult:
Expand Down Expand Up @@ -414,6 +525,10 @@ def __init__(
logger.info_once("Sequence parallelism is enabled, shared experts are replicated for best performance.")

self._gate = gate
if not vllm_version_is("0.15.0"):
# Recreate the runner with the correct shared_experts parameter
# The parent class created the runner before self._shared_experts was set
self.runner = self._init_runner()

if self.multistream_overlap_shared_expert:
# Wrap the quant_method's process_weights_after_loading to validate that
Expand Down
12 changes: 9 additions & 3 deletions vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,13 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
"increase the number of supported shapes, set HCCL_OP_EXPANSION_MODE=AIV."
)

from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.15.0"):
arch_name = vllm_config.model_config.architectures[0]
else:
arch_name = vllm_config.model_config.architecture

# If original sizes exceed maximum, sample a representative subset
if max_num_batch_sizes < len(original_sizes):
# Sample uniformly from original sizes
Expand All @@ -535,10 +542,9 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:

sampled_sizes = [original_sizes[i] for i in indices]
update_cudagraph_capture_sizes(vllm_config, sampled_sizes)

logger.info(
"Adjusted ACL graph batch sizes for %s model (layers: %d): %d → %d sizes",
vllm_config.model_config.architectures[0],
arch_name,
num_hidden_layers,
len(original_sizes),
len(
Expand All @@ -550,7 +556,7 @@ def update_aclgraph_sizes(vllm_config: VllmConfig) -> None:
compilation_config.cudagraph_capture_sizes = original_sizes
logger.info(
"No adjustment needed for ACL graph batch sizes: %s model (layers: %d) with %d sizes",
vllm_config.model_config.architectures[0],
arch_name,
num_hidden_layers,
len(original_sizes),
)
Expand Down