# Patch summary (descriptive preamble placed ahead of the first file header).
#
# .github/workflows/scripts/config.yaml
#   Adds tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py to the
#   e2e-multicard-2-cards list with estimated_time 223.
#
# tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py (new file)
#   Adds test_llama_lora_tp2, parametrized over fully_sharded_loras in
#   [False, True]; it builds a VllmRunner with enable_lora=True and
#   tensor_parallel_size=2 and passes the model to generate_and_test.
#
# vllm_ascend/lora/punica_npu.py
#   Removes the lora_bias_stacked parameter from add_expand, removes the
#   self._apply_bias(...) call that used it, drops the matching None argument
#   at the add_expand call site in add_lora_linear, and documents offset_start
#   in the add_expand docstring.
#
# vllm_ascend/lora/utils.py
#   Imports the *WithShardedLoRA layer classes and _fully_sharded_can_replace,
#   adds @_not_fully_sharded_can_replace to three existing can_replace_layer
#   classmethods, adds five Ascend*WithShardedLoRA classes decorated with
#   @_fully_sharded_can_replace, and registers them in
#   refresh_all_lora_classes.
#
# NOTE(review): both existing modules change mode 100644 -> 100755 and the new
#   test file is created as 100755; this looks unintentional - confirm.
# NOTE(review): leading whitespace on hunk lines was reconstructed from a
#   whitespace-collapsed copy; verify context lines against the target files.
diff --git a/.github/workflows/scripts/config.yaml b/.github/workflows/scripts/config.yaml
index acca617cc6d..c649f8ac8c7 100644
--- a/.github/workflows/scripts/config.yaml
+++ b/.github/workflows/scripts/config.yaml
@@ -97,6 +97,8 @@ e2e-multicard-2-cards:
     estimated_time: 400
   - name: tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py
     estimated_time: 60
+  - name: tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
+    estimated_time: 223
   # Run the test in a separate step to avoid oom
   - name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_multistream_moe_tp2
     estimated_time: 100
diff --git a/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py b/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
new file mode 100755
index 00000000000..06bb5065362
--- /dev/null
+++ b/tests/e2e/multicard/2-cards/test_llama32_lora_tp2.py
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from tests.e2e.conftest import VllmRunner, wait_until_npu_memory_free
+from tests.e2e.singlecard.test_llama32_lora import generate_and_test
+from vllm_ascend.utils import enable_custom_op
+
+enable_custom_op()
+
+# For hk region, we need to use the model from hf to avoid the network issue
+MODEL_PATH = "vllm-ascend/Llama-3.2-3B-Instruct"
+
+
+@pytest.mark.parametrize("fully_sharded_loras", [False, True])
+@wait_until_npu_memory_free()
+def test_llama_lora_tp2(llama32_lora_files, fully_sharded_loras):
+    with VllmRunner(
+        MODEL_PATH,
+        enable_lora=True,
+        # also test odd max_num_seqs
+        max_num_seqs=7,
+        max_model_len=1024,
+        max_loras=4,
+        tensor_parallel_size=2,
+        fully_sharded_loras=fully_sharded_loras,
+    ) as vllm_model:
+        llm = vllm_model.model
+        generate_and_test(llm, llama32_lora_files)
diff --git a/vllm_ascend/lora/punica_npu.py b/vllm_ascend/lora/punica_npu.py
old mode 100644
new mode 100755
index 885c0765705..1ae9ac97d5d
--- a/vllm_ascend/lora/punica_npu.py
+++ b/vllm_ascend/lora/punica_npu.py
@@ -205,7 +205,6 @@ def add_expand(
         y: torch.Tensor,
         x: tuple[torch.Tensor, ...] | torch.Tensor,
         lora_b_stacked: tuple[torch.Tensor, ...],
-        lora_bias_stacked: tuple[torch.Tensor, ...] | None,
         output_slices: tuple[int, ...],
         offset_start: int = 0,
         add_inputs=True,
@@ -217,24 +216,20 @@ def add_expand(
         Semantics:
             for i in range(len(lora_b_stacked)):
                 slice = output_slices[i]
-                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] +
-                    lora_bias_stacked[i]
+                y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i]
                 offset += slice
 
         Args:
             y (torch.Tensor): Output tensor.
             x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors
             lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight
-            lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]):
-                bias's weight
             output_slices (Tuple[int, ...]): Every slice's size
+            offset_start (int): The starting position of y, defaults to 0
             add_inputs (bool): Defaults to True.
         """
         y_org = y
         y = y.view(-1, y.shape[-1])
         offset_left = offset_start
-        if lora_bias_stacked is not None:
-            self._apply_bias(self.token_lora_indices, y, output_slices, lora_bias_stacked)
         for slice_idx in range(len(lora_b_stacked)):
             self._apply_expand(
                 y,
@@ -313,7 +308,7 @@ def add_lora_linear(
                 torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) for _ in range(len(output_slices))
             )
         self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs)
-        self.add_expand(y, buffer, lora_b_stacked, None, output_slices, add_inputs=True, **kwargs)
+        self.add_expand(y, buffer, lora_b_stacked, output_slices, add_inputs=True, **kwargs)
 
     def add_lora_logits(
         self,
diff --git a/vllm_ascend/lora/utils.py b/vllm_ascend/lora/utils.py
old mode 100644
new mode 100755
index a0178560303..341a3ab4ba1
--- a/vllm_ascend/lora/utils.py
+++ b/vllm_ascend/lora/utils.py
@@ -4,13 +4,18 @@
 from vllm.config import LoRAConfig
 from vllm.lora.layers import (
     ColumnParallelLinearWithLoRA,
+    ColumnParallelLinearWithShardedLoRA,
     MergedColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithShardedLoRA,
     MergedQKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithShardedLoRA,
     QKVParallelLinearWithLoRA,
+    QKVParallelLinearWithShardedLoRA,
     RowParallelLinearWithLoRA,
+    RowParallelLinearWithShardedLoRA,
     VocabParallelEmbeddingWithLoRA,
 )
-from vllm.lora.layers.utils import _not_fully_sharded_can_replace
+from vllm.lora.layers.utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
 
 from vllm_ascend.ops.linear import (
     AscendColumnParallelLinear,
@@ -23,6 +28,7 @@
 
 class AscendColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     @classmethod
+    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
@@ -35,6 +41,7 @@ def can_replace_layer(
 
 class AscendMergedColumnParallelLinearWithLoRA(MergedColumnParallelLinearWithLoRA):
     @classmethod
+    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
@@ -47,6 +54,7 @@ def can_replace_layer(
 
 class AscendRowParallelLinearWithLoRA(RowParallelLinearWithLoRA):
     @classmethod
+    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
@@ -95,6 +103,71 @@ def can_replace_layer(
         return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 3
 
 
+class AscendColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithShardedLoRA):
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None = None,
+    ) -> bool:
+        return type(source_layer) is AscendColumnParallelLinear
+
+
+class AscendMergedColumnParallelLinearWithShardedLoRA(MergedColumnParallelLinearWithShardedLoRA):
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None = None,
+    ) -> bool:
+        return type(source_layer) is AscendMergedColumnParallelLinear
+
+
+class AscendMergedQKVParallelLinearWithShardedLoRA(MergedQKVParallelLinearWithShardedLoRA):
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None = None,
+    ) -> bool:
+        return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 3
+
+
+class AscendQKVParallelLinearWithShardedLoRA(QKVParallelLinearWithShardedLoRA):
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None = None,
+    ) -> bool:
+        return type(source_layer) is AscendQKVParallelLinear and len(packed_modules_list) == 1
+
+
+class AscendRowParallelLinearWithShardedLoRA(RowParallelLinearWithShardedLoRA):
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None = None,
+    ) -> bool:
+        return type(source_layer) is AscendRowParallelLinear
+
+
 def refresh_all_lora_classes():
     vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithLoRA)
@@ -102,3 +175,8 @@ def refresh_all_lora_classes():
     vllm.lora.utils._all_lora_classes.add(AscendVocabParallelEmbeddingWithLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithLoRA)
     vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendColumnParallelLinearWithShardedLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendMergedColumnParallelLinearWithShardedLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendMergedQKVParallelLinearWithShardedLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendQKVParallelLinearWithShardedLoRA)
+    vllm.lora.utils._all_lora_classes.add(AscendRowParallelLinearWithShardedLoRA)