Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/features/lora.md
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ The new format of `--lora-modules` is mainly to support the display of parent mo

## LoRA Support for Tower and Connector of Multi-Modal Model

Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector.
Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector. Please refer to [Issue 31479](https://github.com/vllm-project/vllm/issues/31479) to check the current model support status.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, add MM LoRA support status here


## Default LoRA Models For Multimodal Models

Expand Down
10 changes: 7 additions & 3 deletions vllm/lora/ops/triton_ops/fused_moe_lora_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,9 @@ def _fused_moe_lora_shrink(
num_stages: int,
split_k: int,
mul_routed_weight: bool = False,
use_gdc: bool = False,
) -> None:
w1_lora_a_stacked = lora_a_stacked[0]
use_gdc = supports_pdl(qcurr_hidden_states.device)
shrink_config = {
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
Expand Down Expand Up @@ -326,6 +326,7 @@ def _fused_moe_lora_expand(
split_k: int,
mul_routed_weight: bool = False,
offset: int = 0,
use_gdc: bool = False,
) -> None:
b_ptr = _get_ptr(lora_b_stacked, device)
K = max_lora_rank
Expand All @@ -337,7 +338,6 @@ def _fused_moe_lora_expand(
-1, a_intermediate_cache1.shape[3]
)

use_gdc = supports_pdl(a_intermediate_cache1.device)
expand_config = {
"BLOCK_SIZE_M": block_size_m,
"BLOCK_SIZE_N": block_size_n,
Expand Down Expand Up @@ -466,7 +466,7 @@ def _fused_moe_lora(
dtype=output.dtype,
device=device,
)

use_gdc = supports_pdl(device) and not fully_sharded
_fused_moe_lora_shrink(
a_intermediate_cache1,
qcurr_hidden_states,
Expand Down Expand Up @@ -495,6 +495,7 @@ def _fused_moe_lora(
shrink_num_stages,
shrink_split_k,
mul_routed_weight,
use_gdc=use_gdc,
)

if fully_sharded:
Expand Down Expand Up @@ -542,6 +543,7 @@ def _fused_moe_lora(
expand_split_k,
mul_routed_weight,
offset,
use_gdc=use_gdc,
)


Expand Down Expand Up @@ -604,6 +606,7 @@ def _fused_moe_lora_shrink_fake(
num_stages: int,
split_k: int,
mul_routed_weight: bool = False,
use_gdc: bool = False,
) -> None:
return

Expand Down Expand Up @@ -637,6 +640,7 @@ def _fused_moe_lora_expand_fake(
num_stages: int,
split_k: int,
mul_routed_weight: bool = False,
use_gdc: bool = False,
) -> None:
return

Expand Down
6 changes: 3 additions & 3 deletions vllm/lora/ops/triton_ops/lora_expand_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
from vllm.triton_utils import tl, triton
from vllm.utils.torch_utils import direct_register_custom_op

from .utils import supports_pdl


@triton.jit
def _lora_expand_kernel(
Expand Down Expand Up @@ -241,7 +239,9 @@ def _lora_expand(
# thread blocks simply exit.
MAX_LORAS,
)
use_gdc = supports_pdl(inputs.device)
# We disable PDL temporarily because LoRA kernels are not launched back-to-back,
# which invalidates PDL and hurts kernel performance.
use_gdc = False # supports_pdl(inputs.device)
_lora_expand_kernel[grid](
inputs,
lora_ptr_tensor,
Expand Down
6 changes: 3 additions & 3 deletions vllm/lora/ops/triton_ops/lora_shrink_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@
from vllm.triton_utils import tl, triton
from vllm.utils.torch_utils import direct_register_custom_op

from .utils import supports_pdl


@triton.jit
def _lora_shrink_kernel(
Expand Down Expand Up @@ -221,7 +219,9 @@ def _lora_shrink(
# thread blocks exit early.
MAX_LORAS,
)
use_gdc = supports_pdl(inputs.device)
# We disable PDL temporarily because LoRA kernels are not launched back-to-back,
# which invalidates PDL and hurts kernel performance.
use_gdc = False # supports_pdl(inputs.device)
_lora_shrink_kernel[grid](
inputs,
lora_ptr_tensor,
Expand Down