tests/e2e/singlecard/test_llama32_lora.py (2 changes: 0 additions & 2 deletions)
@@ -3,7 +3,6 @@
 
 from unittest.mock import patch
 
-import pytest
 import vllm
 import vllm.config
 from vllm.lora.request import LoRARequest
@@ -126,7 +125,6 @@ def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | None
     print("removing lora")
 
 
-@pytest.mark.skip(reason="fix me")
 @patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
 def test_llama_lora(llama32_lora_files):
     vllm_model = VllmRunner(
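For context, the test re-enabled above exercises vLLM's standard LoRA request path. The snippet below is a minimal sketch of that flow, not the test's actual body: the model name, adapter name, and adapter path are placeholders, and the real test drives everything through VllmRunner and the llama32_lora_files fixture.

import vllm
from vllm.lora.request import LoRARequest

# Placeholder model and adapter; the real test uses its own fixtures.
llm = vllm.LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enable_lora=True,
    max_loras=1,
)
outputs = llm.generate(
    ["What is the capital of France?"],
    vllm.SamplingParams(temperature=0.0, max_tokens=32),
    lora_request=LoRARequest("demo_adapter", 1, "/path/to/lora_adapter"),
)
print(outputs[0].outputs[0].text)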
vllm_ascend/worker/model_runner_v1.py (19 changes: 16 additions & 3 deletions)
@@ -1848,6 +1848,7 @@ def _determine_batch_execution_and_padding(
         # be improved in model runner v2)
         force_uniform_decode: bool | None = None,
         force_has_lora: bool | None = None,
+        force_num_active_loras: int | None = None,
         num_encoder_reqs: int = 0,
     ) -> tuple[CUDAGraphMode, BatchDescriptor, bool, torch.Tensor | None, CUDAGraphStat | None]:
         num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
@@ -1864,7 +1865,12 @@
         # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
         # is present). Also, chunked-prefill is disabled, so batch are uniform.
         has_encoder_output = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
-        has_lora = len(self.input_batch.lora_id_to_lora_request) > 0 if force_has_lora is None else force_has_lora
+        num_active_loras = (
+            force_num_active_loras
+            if force_num_active_loras is not None
+            else len(self.input_batch.lora_id_to_lora_request)
+        )
+        has_lora = num_active_loras > 0 if force_has_lora is None else force_has_lora
 
         # ruff: noqa: E731
         def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None):
@@ -1877,6 +1883,7 @@ def dispatch_cudagraph(num_tokens, disable_full=False, valid_modes=None):
                     has_lora=has_lora,
                     uniform_decode=uniform_decode,
                     disable_full=disable_full,
+                    num_active_loras=num_active_loras,
                 )
             else:
                 return self.cudagraph_dispatcher.dispatch(
@@ -1885,6 +1892,7 @@
                     uniform_decode=uniform_decode,
                     valid_modes=valid_modes,
                     invalid_modes={CUDAGraphMode.FULL} if disable_full else None,
+                    num_active_loras=num_active_loras,
                 )
 
         cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded, use_cascade_attn or has_encoder_output)
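The core of the change in _determine_batch_execution_and_padding is the precedence between the forced count and the live batch state. Below is a self-contained sketch of that selection logic, using a plain dict in place of the runner's input_batch; apart from the two force_* parameters taken from the diff, the names are placeholders.

def resolve_lora_state(
    lora_id_to_lora_request: dict,
    force_num_active_loras: int | None = None,
    force_has_lora: bool | None = None,
) -> tuple[int, bool]:
    # Prefer the explicitly forced count (used when capturing cudagraphs,
    # before any dummy LoRA has actually been activated); otherwise count
    # the LoRAs currently present in the batch.
    num_active_loras = (
        force_num_active_loras
        if force_num_active_loras is not None
        else len(lora_id_to_lora_request)
    )
    has_lora = num_active_loras > 0 if force_has_lora is None else force_has_lora
    return num_active_loras, has_lora

# During capture the batch holds no real LoRA requests yet, so the forced
# count is what drives the batch descriptor.
assert resolve_lora_state({}, force_num_active_loras=2) == (2, True)
# In a normal step the batch contents decide.
assert resolve_lora_state({1: "req-a"}) == (1, True)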
@@ -2182,7 +2190,6 @@ def _dummy_run(
         allow_microbatching: bool = True,
         skip_eplb: bool = False,
         remove_lora: bool = True,
-        activate_lora: bool = False,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
     ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -2246,7 +2253,8 @@
             # `force_has_lora` is used for cudagraph capture; because LoRA is
             # activated later in the context manager, but we need to know the
             # LoRA state when determining the batch descriptor for capture
-            force_has_lora=activate_lora,
+            force_has_lora=num_active_loras > 0,
+            force_num_active_loras=num_active_loras,
         )
         if self.use_cp:
             self.pcp_manager.init_batch_info(
@@ -2321,6 +2329,11 @@
             self.lora_config,
             num_scheduled_tokens,
             num_sampled_tokens,
+            remove_lora,
+            # TODO: The next line is a temporary workaround
+            # to fix the accuracy issue of test_llama32_lora.py,
+            # which is introduced by vllm-project/vllm#32005
+            num_active_loras=(self.lora_config.max_loras if self.lora_config is not None else num_active_loras),
         ):
             # Make sure padding doesn't exceed max_num_tokens
             assert num_tokens_padded <= self.max_num_tokens
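The last hunk is the workaround flagged in the TODO: whenever LoRA is configured, the dummy-LoRA context is sized to lora_config.max_loras rather than to the requested num_active_loras. A minimal sketch of that fallback is below, with a simplified stand-in for vllm's LoRAConfig (only max_loras is modeled).

from dataclasses import dataclass

@dataclass
class LoRAConfigSketch:
    # Simplified stand-in; the real LoRAConfig carries many more fields.
    max_loras: int

def dummy_lora_count(lora_config: LoRAConfigSketch | None, num_active_loras: int) -> int:
    # Workaround from the diff: when LoRA is configured, always prepare
    # max_loras dummy adapters; otherwise fall back to the requested count
    # (0 when LoRA is disabled).
    return lora_config.max_loras if lora_config is not None else num_active_loras

assert dummy_lora_count(LoRAConfigSketch(max_loras=4), num_active_loras=1) == 4
assert dummy_lora_count(None, num_active_loras=0) == 0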