From ab87af0ce364f27afdf6890cad50f81bf0ac0c61 Mon Sep 17 00:00:00 2001
From: paulyu12 <507435917@qq.com>
Date: Tue, 3 Mar 2026 16:48:14 +0800
Subject: [PATCH 1/5] [bugfix][LoRA] fix the lora accuracy issue

Signed-off-by: paulyu12 <507435917@qq.com>
---
 vllm_ascend/worker/model_runner_v1.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index b1d925a58df..cdf1db5e9ec 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -1734,6 +1734,7 @@ def _determine_batch_execution_and_padding(
         # be improved in model runner v2)
         force_uniform_decode: bool | None = None,
         force_has_lora: bool | None = None,
+        force_num_active_loras: int | None = None,
         num_encoder_reqs: int = 0,
     ) -> tuple[CUDAGraphMode, BatchDescriptor, bool, torch.Tensor | None, CUDAGraphStat | None]:
         num_tokens_padded = self._pad_for_sequence_parallelism(num_tokens)
@@ -1748,7 +1749,12 @@ def _determine_batch_execution_and_padding(
         # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output
         # is present). Also, chunked-prefill is disabled, so batch are uniform.
         has_encoder_output = self.model_config.is_encoder_decoder and num_encoder_reqs > 0
-        has_lora = len(self.input_batch.lora_id_to_lora_request) > 0 if force_has_lora is None else force_has_lora
+        num_active_loras = (
+            force_num_active_loras
+            if force_num_active_loras is not None
+            else len(self.input_batch.lora_id_to_lora_request)
+        )
+        has_lora = num_active_loras > 0 if force_has_lora is None else force_has_lora
 
         # ruff: noqa: E731
         dispatch_cudagraph = (
@@ -1757,6 +1763,7 @@ def _determine_batch_execution_and_padding(
                 has_lora=has_lora,
                 uniform_decode=uniform_decode,
                 disable_full=disable_full,
+                num_active_loras=num_active_loras,
             )
             if not force_eager
             else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded))
@@ -2031,7 +2038,6 @@ def _dummy_run(
         allow_microbatching: bool = True,
         skip_eplb: bool = False,
         remove_lora: bool = True,
-        activate_lora: bool = False,
         is_graph_capturing: bool = False,
         num_active_loras: int = 0,
     ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -2092,7 +2098,8 @@ def _dummy_run(
             # `force_has_lora` is used for cudagraph capture; because LoRA is
             # activated later in the context manager, but we need to know the
             # LoRA state when determining the batch descriptor for capture
-            force_has_lora=activate_lora,
+            force_has_lora=num_active_loras > 0,
+            force_num_active_loras=num_active_loras,
         )
         if self.use_cp:
             self.pcp_manager.init_batch_info(
@@ -2167,6 +2174,12 @@ def _dummy_run(
             self.lora_config,
             num_scheduled_tokens,
             num_sampled_tokens,
+            remove_lora,
+            num_active_loras=(
+                self.lora_config.max_loras
+                if self.lora_config is not None
+                else num_active_loras
+            ),
         ):
             # Make sure padding doesn't exceed max_num_tokens
             assert num_tokens_padded <= self.max_num_tokens

From e49acd761d52623049c476302f9e2a657bb77928 Mon Sep 17 00:00:00 2001
From: paulyu12 <507435917@qq.com>
Date: Tue, 3 Mar 2026 01:13:40 -0800
Subject: [PATCH 2/5] [Bugfix][LoRA] collapse the num_active_loras argument in _dummy_run onto one line

Signed-off-by: paulyu12 <507435917@qq.com>
---
 vllm_ascend/worker/model_runner_v1.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index cdf1db5e9ec..644771f37ad 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2175,11 +2175,7 @@ def _dummy_run(
             num_scheduled_tokens,
             num_sampled_tokens,
             remove_lora,
-            num_active_loras=(
-                self.lora_config.max_loras
-                if self.lora_config is not None
-                else num_active_loras
-            ),
+            num_active_loras=(self.lora_config.max_loras if self.lora_config is not None else num_active_loras),
         ):
             # Make sure padding doesn't exceed max_num_tokens
             assert num_tokens_padded <= self.max_num_tokens

From 053d28030c60ea03f889c5b5a1caf0578697e88b Mon Sep 17 00:00:00 2001
From: paulyu12 <507435917@qq.com>
Date: Tue, 3 Mar 2026 01:26:09 -0800
Subject: [PATCH 3/5] [Bugfix][LoRA] add a TODO noting that num_active_loras=max_loras is a temporary workaround

Signed-off-by: paulyu12 <507435917@qq.com>
---
 vllm_ascend/worker/model_runner_v1.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 644771f37ad..5466abbbcd2 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2175,6 +2175,9 @@ def _dummy_run(
             num_scheduled_tokens,
             num_sampled_tokens,
             remove_lora,
+            # TODO: The next line is a temporary workaround
+            # to fix the accuracy issue of test_llama32_lora.py, 
+            # which is introduced by vllm-project/vllm#32005
             num_active_loras=(self.lora_config.max_loras if self.lora_config is not None else num_active_loras),
         ):
             # Make sure padding doesn't exceed max_num_tokens

From b9bd04f50089aa45ef5dddebc8b0588b5109645d Mon Sep 17 00:00:00 2001
From: paulyu12 <507435917@qq.com>
Date: Tue, 3 Mar 2026 16:50:31 -0800
Subject: [PATCH 4/5] [Bugfix][LoRA] lint

Signed-off-by: paulyu12 <507435917@qq.com>
---
 vllm_ascend/worker/model_runner_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
index 5466abbbcd2..c4060f0c5b4 100644
--- a/vllm_ascend/worker/model_runner_v1.py
+++ b/vllm_ascend/worker/model_runner_v1.py
@@ -2176,7 +2176,7 @@ def _dummy_run(
             num_sampled_tokens,
             remove_lora,
             # TODO: The next line is a temporary workaround
-            # to fix the accuracy issue of test_llama32_lora.py, 
+            # to fix the accuracy issue of test_llama32_lora.py,
             # which is introduced by vllm-project/vllm#32005
             num_active_loras=(self.lora_config.max_loras if self.lora_config is not None else num_active_loras),
         ):

From 2ce61cbdbefd117f81c9c458d5d34b87330efbdc Mon Sep 17 00:00:00 2001
From: paulyu12 <507435917@qq.com>
Date: Mon, 9 Mar 2026 03:16:51 +0000
Subject: [PATCH 5/5] [bugfix][LoRA] re-enable test_llama_lora and drop the unused pytest import

Signed-off-by: paulyu12 <507435917@qq.com>
---
 tests/e2e/singlecard/test_llama32_lora.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/e2e/singlecard/test_llama32_lora.py b/tests/e2e/singlecard/test_llama32_lora.py
index ab7015b2aaa..9ce3e6849f8 100644
--- a/tests/e2e/singlecard/test_llama32_lora.py
+++ b/tests/e2e/singlecard/test_llama32_lora.py
@@ -3,7 +3,6 @@
 
 from unittest.mock import patch
 
-import pytest
 import vllm
 import vllm.config
 from vllm.lora.request import LoRARequest
@@ -126,7 +125,6 @@ def generate_and_test(llm, llama32_lora_files, tensorizer_config_dict: dict | No
     print("removing lora")
 
 
-@pytest.mark.skip(reason="fix me")
 @patch.dict("os.environ", {"VLLM_USE_MODELSCOPE": "False"})
 def test_llama_lora(llama32_lora_files):
     vllm_model = VllmRunner(