2 changes: 0 additions & 2 deletions tests/e2e/multicard/2-cards/test_full_graph_mode.py
@@ -18,7 +18,6 @@
 #
 import os
 
-import pytest
 from vllm import SamplingParams
 
 from tests.e2e.conftest import VllmRunner
@@ -68,7 +67,6 @@ def test_qwen3_moe_full_decode_only_tp2():
     )
 
 
-@pytest.mark.skip(reason="CANN8.5 failed with this test, fix me")
 def test_qwen3_moe_full_graph_tp2():
     if "HCCL_OP_EXPANSION_MODE" in os.environ:
         del os.environ["HCCL_OP_EXPANSION_MODE"]
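With the `@pytest.mark.skip` decorator removed (and the now-unused `pytest` import dropped), `test_qwen3_moe_full_graph_tp2` runs again in the e2e multicard suite. For reference, a minimal way to invoke just this test, assuming a checkout with the repository's e2e dependencies installed:

```python
# Runs only the re-enabled test; the node id comes from the diff above.
import pytest

pytest.main([
    "tests/e2e/multicard/2-cards/test_full_graph_mode.py::test_qwen3_moe_full_graph_tp2",
    "-v",
])
```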
24 changes: 20 additions & 4 deletions vllm_ascend/worker/model_runner_v1.py
@@ -521,11 +521,24 @@ def get_model(self) -> nn.Module:
         return self.model.unwrap()
         return self.model
 
-    def _pad_query_start_loc_for_fia(self, num_tokens_padded: int, num_reqs_padded: int, num_reqs: int) -> int:
+    def _pad_query_start_loc_for_fia(
+        self,
+        num_tokens_padded: int,
+        num_reqs_padded: int,
+        num_reqs: int,
+        cudagraph_runtime_mode: CUDAGraphMode | None = None,
+        batch_desc_num_reqs: int | None = None,
+    ) -> int:
         """
         This function exists only to satisfy the constraint that, when the layout is TND,
         the first dimension of `hidden_states` must equal the last element of `actual_seq_lengths_q`.
         """
+        # TODO: refactor later. vllm PR #34043 deletes relax_for_mixed_batch_cudagraphs,
+        # so num_reqs no longer equals the actual number of requests.
+        if cudagraph_runtime_mode == CUDAGraphMode.FULL:
+            num_reqs_padded = num_reqs
+        else:
+            num_reqs_padded = batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs
 
         if num_tokens_padded == num_reqs_padded * self.uniform_decode_query_len:
             # Uniform-batch case: num_reqs must be no greater than num_reqs_padded
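To see the new selection in isolation, here is a minimal sketch of the branch added above. The `CUDAGraphMode` enum is a stand-in for vllm's real one, and the wrapper function is illustrative; only the FULL comparison and the fallback chain are taken from the diff:

```python
# Minimal sketch of the num_reqs_padded selection added above.
from enum import Enum


class CUDAGraphMode(Enum):
    NONE = 0
    PIECEWISE = 1
    FULL = 2


def select_num_reqs_padded(
    num_reqs: int,
    batch_desc_num_reqs: int | None = None,
    cudagraph_runtime_mode: CUDAGraphMode | None = None,
) -> int:
    if cudagraph_runtime_mode == CUDAGraphMode.FULL:
        # Full-graph mode: pad against the actual request count.
        return num_reqs
    # Otherwise prefer the batch descriptor's request count when available.
    return batch_desc_num_reqs if batch_desc_num_reqs is not None else num_reqs


assert select_num_reqs_padded(3, 8, CUDAGraphMode.FULL) == 3
assert select_num_reqs_padded(3, 8, CUDAGraphMode.PIECEWISE) == 8
assert select_num_reqs_padded(3, None) == 3
```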
@@ -1218,7 +1231,9 @@ def execute_model(
         # Another possible condition is num_tokens_padded != num_tokens_unpadded,
         # but that scope is far too broad and the consequences are unpredictable.
         old_num_reqs_padded = num_reqs_padded
-        num_reqs_padded = self._pad_query_start_loc_for_fia(num_tokens_padded, num_reqs_padded, num_reqs)
+        num_reqs_padded = self._pad_query_start_loc_for_fia(
+            num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_mode, batch_desc.num_reqs
+        )
         if enable_sp() and num_tokens_padded == num_tokens_unpadded:
             if num_reqs_padded > old_num_reqs_padded:
                 num_reqs_padded = old_num_reqs_padded
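The `enable_sp()` guard above ensures that, with sequence parallelism enabled and no token padding applied, the FIA helper never increases the padded request count. A standalone restatement of the clamp, where only the condition and assignment mirror the diff and the wrapper function itself is hypothetical:

```python
# Restates the sequence-parallel clamp above; min() is equivalent to the
# "if greater, revert" branch in the diff.
def clamp_num_reqs_padded_for_sp(
    num_reqs_padded: int,
    old_num_reqs_padded: int,
    num_tokens_padded: int,
    num_tokens_unpadded: int,
    sp_enabled: bool,
) -> int:
    if sp_enabled and num_tokens_padded == num_tokens_unpadded:
        # No token padding happened: do not let FIA grow the request padding.
        return min(num_reqs_padded, old_num_reqs_padded)
    return num_reqs_padded
```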
@@ -2324,8 +2339,9 @@ def _dummy_run(
         cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
         self.query_start_loc.np[1 : num_reqs_padded + 1] = cum_num_tokens
         self.query_start_loc.copy_to_gpu()
-
-        num_reqs_padded = self._pad_query_start_loc_for_fia(num_tokens_padded, num_reqs_padded, num_reqs)
+        num_reqs_padded = self._pad_query_start_loc_for_fia(
+            num_tokens_padded, num_reqs_padded, num_reqs, cudagraph_runtime_mode, batch_desc.num_reqs
+        )
 
         pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL
         attn_metadata, _ = self._build_attention_metadata(
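As context for why `_pad_query_start_loc_for_fia` runs right after `query_start_loc` is filled in `_dummy_run`: under the TND layout, the last element of `actual_seq_lengths_q` (derived from `query_start_loc`) must match the padded first dimension of `hidden_states`. A toy illustration of that invariant follows; the numbers are invented and this is not the helper's exact padding scheme:

```python
# Toy check of the TND invariant: query_start_loc is a cumulative sum of
# per-request token counts, and its final entry must equal num_tokens_padded.
import numpy as np

num_scheduled_tokens = np.array([4, 4, 4])     # 3 requests, uniform decode length 4
num_reqs_padded = 4                            # padded request count
num_tokens_padded = 16                         # padded hidden_states first dimension

query_start_loc = np.zeros(num_reqs_padded + 1, dtype=np.int64)
query_start_loc[1:4] = np.cumsum(num_scheduled_tokens)
query_start_loc[4:] = num_tokens_padded        # pad trailing entries up to the target

assert query_start_loc[-1] == num_tokens_padded
```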