91 commits
e7026a7
add mm_punica_warpper
linitra24 May 21, 2025
98debc2
Merge branch 'main' of https://github.com/B-201/vllm into v1-support-…
linitra24 Jul 2, 2025
7db0d59
update mm filter
linitra24 Jul 9, 2025
e2caeb3
Merge branch 'main' of https://github.com/B-201/vllm into v1-support-…
linitra24 Jul 9, 2025
891df1d
update
linitra24 Oct 11, 2025
236e0fe
update
linitra24 Oct 11, 2025
cb1a6f0
update
linitra24 Oct 13, 2025
0fa9111
update
linitra24 Oct 13, 2025
882213c
update
linitra24 Oct 13, 2025
8863bd2
update
linitra24 Oct 13, 2025
5c156c9
Init
jeejeelee Oct 13, 2025
a69bde7
[feat] add connector support
prashanth058 Nov 20, 2025
112779f
Merge pull request #7 from prashanth058/mlm-connector-support
linitra24 Nov 20, 2025
ee0cdef
Merge remote-tracking branch 'origin/main' into mlm-full-lora-support
linitra24 Nov 25, 2025
a364787
fix pre-commit
linitra24 Nov 25, 2025
8157363
qwen2.5 & 3 vl fixes and tests
prashanth058 Nov 25, 2025
7e710bc
Merge pull request #9 from prashanth058/vision-lora-fixes
linitra24 Nov 26, 2025
181b5f8
remove redundant assingments
prashanth058 Nov 26, 2025
9d41f6e
Merge pull request #10 from prashanth058/lora-vision-misc-fixes
linitra24 Nov 26, 2025
92ed13c
fix bug
linitra24 Nov 26, 2025
dd91286
Merge remote-tracking branch 'origin/main' into mlm-full-lora-support
linitra24 Nov 26, 2025
83556e9
Address conflict
jeejeelee Dec 3, 2025
e635861
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 3, 2025
e6784ed
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/…
linitra24 Dec 3, 2025
c0cc07e
Merge remote-tracking branch 'origin/main' into mlm-full-lora-support
linitra24 Dec 3, 2025
c94cdf1
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 4, 2025
598052b
fix bug
linitra24 Dec 4, 2025
f67ccfa
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/…
linitra24 Dec 4, 2025
1745bb9
address the ci issue
linitra24 Dec 5, 2025
113eb2e
add a enable option
linitra24 Dec 5, 2025
1fbd728
Merge branch 'main' into mlm-full-lora-support
linitra24 Dec 5, 2025
3e33423
address ci issue
linitra24 Dec 5, 2025
5ff0c6f
Merge remote-tracking branch 'origin/main' into mlm-full-lora-support
linitra24 Dec 10, 2025
f6a1357
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 10, 2025
d3c2f3d
address ci issue
linitra24 Dec 10, 2025
1d2c539
address ci issue
linitra24 Dec 10, 2025
bbd90e8
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/…
linitra24 Dec 10, 2025
e2ea025
address pre-commit & ci issue
linitra24 Dec 11, 2025
d1307e1
Merge branch 'main' into mlm-full-lora-support
linitra24 Dec 11, 2025
2744849
update argument name
linitra24 Dec 11, 2025
65e403d
remove outdated comment
linitra24 Dec 11, 2025
dd857e4
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/…
linitra24 Dec 11, 2025
e10321b
Merge branch 'main' into mlm-full-lora-support
linitra24 Dec 11, 2025
208dc0c
Fix comments
jeejeelee Dec 12, 2025
d4f39dc
Move forward
jeejeelee Dec 12, 2025
0642610
Move forward
jeejeelee Dec 12, 2025
5e78570
update packed modules mapping (#11)
prashanth058 Dec 12, 2025
1cb3546
Move forward
jeejeelee Dec 12, 2025
421707d
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 12, 2025
35acd22
Move forward
jeejeelee Dec 12, 2025
6a3f0a5
fix the issue with the MM token count
linitra24 Dec 13, 2025
c1bb71e
fix pre-commit
linitra24 Dec 13, 2025
58d2c47
update punica_wrapper_mapping
linitra24 Dec 15, 2025
bdac2b5
Merge branch 'main' into mlm-full-lora-support
linitra24 Dec 16, 2025
5791781
fix bug
linitra24 Dec 16, 2025
94dce5c
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 16, 2025
da0adea
added abstract methods to the base class
Anexdeus Dec 16, 2025
36121c6
fixed property bug in processor and added abstract methods in BasePro…
Anexdeus Dec 16, 2025
3d39188
Fix
jeejeelee Dec 17, 2025
fe104bd
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 17, 2025
1c8e3c4
fix pre-commit
linitra24 Dec 17, 2025
df3ec22
remove hacky code
linitra24 Dec 18, 2025
764aa45
fix bug
linitra24 Dec 19, 2025
463074f
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 20, 2025
d053aa7
Fix
jeejeelee Dec 20, 2025
9c9950c
fix
linitra24 Dec 20, 2025
4c2e95a
correct f-string formatting
linitra24 Dec 20, 2025
e5ba472
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 20, 2025
b03d1a0
added ProcessingInfoMixin for QwenVL series models
Anexdeus Dec 20, 2025
d525556
Revert the mixin changes
Anexdeus Dec 20, 2025
cd32aea
Merge branch 'jeejeelee:mlm-full-lora-support' into mlm-full-lora-sup…
Anexdeus Dec 20, 2025
c6831e7
extended SupportsMultiModal
Anexdeus Dec 20, 2025
68116ed
fix bug
linitra24 Dec 20, 2025
cb72a0e
fix pre-commit
linitra24 Dec 20, 2025
2b03137
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/…
Anexdeus Dec 20, 2025
86c6c5c
removed get_allowed_mm_limits() from models
Anexdeus Dec 20, 2025
a3a8fc1
Merge pull request #12 from Anexdeus/mlm-full-lora-support
linitra24 Dec 21, 2025
2040209
move mm-token-functions to model
linitra24 Dec 21, 2025
81b5ace
revert lora_kwargs change
linitra24 Dec 21, 2025
fa6dd85
fix
linitra24 Dec 21, 2025
8aedddd
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 21, 2025
f3a55ff
fix mm_hash
linitra24 Dec 22, 2025
f114b4e
disable mm cache when enable_tower_connector_lora
linitra24 Dec 23, 2025
a9b9af9
Merge remote-tracking branch 'origin/main' into mlm-full-lora-support
linitra24 Dec 23, 2025
390ac9a
fix pre-commit
linitra24 Dec 23, 2025
f9ca685
Merge remote-tracking branch 'origin/main' into mlm-full-lora-support
linitra24 Dec 24, 2025
ac8afb6
fix pre-commit
linitra24 Dec 24, 2025
57d7267
cleanup
linitra24 Dec 24, 2025
b275e8c
Merge branch 'main' into mlm-full-lora-support
jeejeelee Dec 26, 2025
7183a2f
fix ci issue
linitra24 Dec 26, 2025
41a0492
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/…
linitra24 Dec 26, 2025
4 changes: 4 additions & 0 deletions docs/features/lora.md
@@ -275,6 +275,10 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
}
```

## LoRA Support for the Tower and Connector of Multi-Modal Models

Currently, vLLM has experimental LoRA support for the tower (vision encoder) and connector components of multi-modal models. To enable this feature for a model, the corresponding token helper functions for its tower and connector must be implemented. For the rationale behind this approach, see [PR 26674](https://github.com/vllm-project/vllm/pull/26674). Contributions that extend LoRA support to the towers and connectors of additional models are welcome.
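
The following offline sketch shows how the feature fits together for a supported model. It mirrors the test setup in this PR: the adapter repo is the one used by the tests, while the image path, prompt template, and argument values are illustrative assumptions rather than required settings.

```python
from huggingface_hub import snapshot_download
from PIL import Image

import vllm
from vllm.lora.request import LoRARequest

# Adapter that targets the vision tower + connector (repo from this PR's tests).
lora_path = snapshot_download(
    repo_id="prashanth058/qwen2vl-flickr-lora-tower-connector"
)

llm = vllm.LLM(
    model="Qwen/Qwen2-VL-2B-Instruct",
    enable_lora=True,
    max_lora_rank=32,
    enable_tower_connector_lora=True,
    # Tower/connector LoRA is currently incompatible with the
    # multi-modal processor cache, so the cache must be disabled.
    mm_processor_cache_gb=0,
)

image = Image.open("stop_sign.jpg")  # any local test image
prompt = (
    "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "Describe the image.<|im_end|>\n<|im_start|>assistant\n"
)

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    lora_request=LoRARequest("vision_tower_connector", 1, lora_path),
)
print(outputs[0].outputs[0].text)
```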

## Default LoRA Models For Multimodal Models

Some models, e.g., [Granite Speech](https://huggingface.co/ibm-granite/granite-speech-3.3-8b) and [Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) multimodal, contain LoRA adapter(s) that are expected to always be applied when a given modality is present. This can be a bit tedious to manage with the above approaches, as it requires the user to send the `LoRARequest` (offline) or to filter requests between the base model and LoRA model (server) depending on the content of the request's multimodal data.
25 changes: 25 additions & 0 deletions tests/lora/conftest.py
@@ -208,6 +208,31 @@ def qwen25vl_lora_files():
    return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")


@pytest.fixture(scope="session")
def qwen2vl_language_lora_files():
    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-language")


@pytest.fixture(scope="session")
def qwen2vl_vision_tower_connector_lora_files():
    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower-connector")


@pytest.fixture(scope="session")
def qwen2vl_vision_tower_lora_files():
    return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower")


@pytest.fixture(scope="session")
def qwen25vl_vision_lora_files():
    return snapshot_download(repo_id="EpochEcho/qwen2.5-3b-vl-lora-vision-connector")


@pytest.fixture(scope="session")
def qwen3vl_vision_lora_files():
    return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")


@pytest.fixture(scope="session")
def tinyllama_lora_files():
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
28 changes: 21 additions & 7 deletions tests/lora/test_lora_manager.py
@@ -18,6 +18,7 @@
from vllm.lora.lora_model import LoRAModel
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.model_manager import (
    DEFAULT_LANGUAGE_WRAPPER_KEY,
    LoRAMapping,
    LoRAModelManager,
    LRUCacheLoRAModelManager,
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
    assert manager.activate_adapter(2)
    assert manager.lora_index_to_id[0] == 3
    assert manager.lora_index_to_id[1] == 2
-
    assert manager.device == device
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
    assert hasattr(manager, "supported_lora_modules")
    assert sorted(manager.supported_lora_modules) == [
        "dense1",
@@ -278,8 +281,10 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
    assert manager.remove_adapter(3)
    with pytest.raises(ValueError):
        assert manager.pin_adapter(3)
-
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
    assert manager.device == device


@@ -402,7 +407,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    assert manager.remove_oldest_adapter()

    assert set(manager.list_adapters()) == {1}
-    assert manager.punica_wrapper.device == device
+    assert (
+        manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
+        == device
+    )
    assert manager.device == device


@@ -514,7 +522,10 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
    )

    assert worker_adapter_manager.device == device
-    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
+    punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
+        DEFAULT_LANGUAGE_WRAPPER_KEY
+    )
+    assert punica_wrapper.device == device


@pytest.mark.parametrize("device", DEVICES)
@@ -618,7 +629,10 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
    )

    assert worker_adapter_manager.device == device
-    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
+    punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
+        DEFAULT_LANGUAGE_WRAPPER_KEY
+    )
+    assert punica_wrapper.device == device


@pytest.mark.parametrize("device", DEVICES)
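The updated assertions above encode this PR's structural change: the LoRA manager no longer exposes a single `punica_wrapper` attribute but a `punica_wrapper_mapping` holding one Punica wrapper per model component, with `DEFAULT_LANGUAGE_WRAPPER_KEY` selecting the language-model wrapper. A minimal sketch of that lookup pattern follows; the plain-dict shape, the key's string value, and the non-language keys are assumptions for illustration, not the PR's actual types.

```python
import torch

# The name comes from vllm.lora.model_manager; the string value is assumed.
DEFAULT_LANGUAGE_WRAPPER_KEY = "language"


class PunicaWrapperStub:
    """Stand-in for a real Punica wrapper pinned to one device."""

    def __init__(self, device: torch.device):
        self.device = device


# One wrapper per component; the "tower"/"connector" keys are illustrative.
punica_wrapper_mapping: dict[str, PunicaWrapperStub] = {
    DEFAULT_LANGUAGE_WRAPPER_KEY: PunicaWrapperStub(torch.device("cuda:0")),
    "tower": PunicaWrapperStub(torch.device("cuda:0")),
    "connector": PunicaWrapperStub(torch.device("cuda:0")),
}

# What the updated tests assert, in miniature:
assert (
    punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
    == torch.device("cuda:0")
)
```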
129 changes: 125 additions & 4 deletions tests/lora/test_qwen2vl.py → tests/lora/test_qwenvl.py
@@ -14,9 +14,12 @@ class TestConfig:
    lora_path: str
    max_num_seqs: int = 2
    max_loras: int = 2
-    max_lora_rank: int = 16
-    max_model_len: int = 4096
+    max_lora_rank: int = 32
+    enable_tower_connector_lora: bool = False
+    max_model_len: int = 8192
    gpu_memory_utilization: float = 0.85
    mm_processor_kwargs: dict[str, int] | None = None
+    mm_processor_cache_gb: float = 4

    def __post_init__(self):
        if self.mm_processor_kwargs is None:
@@ -48,8 +51,11 @@ def _initialize_llm(self) -> vllm.LLM:
            enable_lora=True,
            max_loras=self.config.max_loras,
            max_lora_rank=self.config.max_lora_rank,
            enable_tower_connector_lora=self.config.enable_tower_connector_lora,
            trust_remote_code=True,
            gpu_memory_utilization=self.config.gpu_memory_utilization,
            mm_processor_kwargs=self.config.mm_processor_kwargs,
            mm_processor_cache_gb=self.config.mm_processor_cache_gb,
            max_model_len=self.config.max_model_len,
        )

@@ -58,6 +64,7 @@ def run_test(
        images: list[ImageAsset],
        expected_outputs: list[str],
        lora_id: int | None = None,
        lora_name: str | None = None,
        temperature: float = 0,
        max_tokens: int = 5,
Expand All @@ -73,10 +80,11 @@ def run_test(
for asset in images
]

lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
lora_request = LoRARequest(
lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
)
outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
generated_texts = [output.outputs[0].text.strip() for output in outputs]

# Validate outputs
for generated, expected in zip(generated_texts, expected_outputs):
assert expected.startswith(generated), (
@@ -127,6 +135,22 @@ def run_beam_search_test(
    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
]

EXPECTED_OUTPUTS_LANGUAGE = [
    "A stop sign is shown in an Asian city, with buildings and a car in the "
    "background.",
    "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
]

EXPECTED_OUTPUTS_VISION = [
    "A stop sign in front of oriental buildings.",
    "A tree with pink flowers in front of it and a blue sky behind the flowers.",
]

EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
    "A stop sign is located on the street of a Chinese neighborhood.",
    "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]

# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS = [
    [
Expand All @@ -137,6 +161,7 @@ def run_beam_search_test(

QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"


def test_qwen2vl_lora(qwen2vl_lora_files):
@@ -175,3 +200,99 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)


def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    config = TestConfig(
        model_path=QWEN25VL_MODEL_PATH,
        lora_path=qwen25vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    config = TestConfig(
        model_path=QWEN3VL_MODEL_PATH,
        lora_path=qwen3vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Test multiple LoRA adapter types (language, vision tower + connector,
    vision tower only) using the same LLM instance to verify mm_encoder_cache
    behavior with different LoRA requests.

    By reusing the same LLM instance across different LoRA requests, we ensure
    that the multimodal encoder cache correctly manages state transitions
    between language-only and vision-enabled LoRA adapters.
    """
    config = TestConfig(
        model_path=QWEN2VL_MODEL_PATH,
        # We'll override the lora_path for each specific test, but need to
        # provide an initial path for initialization
        lora_path=qwen2vl_language_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)

    # Test 1: Language-only LoRA adapter
    tester.config.lora_path = qwen2vl_language_lora_files
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_LANGUAGE,
            lora_id=lora_id,
            lora_name="language_only",
        )

    # Test 2: Vision tower + connector LoRA adapter
    tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files
    for lora_id in [3, 4]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION,
            lora_id=lora_id,
            lora_name="vision_tower_connector",
        )

    # Test 3: Vision tower only LoRA adapter (no connector)
    tester.config.lora_path = qwen2vl_vision_tower_lora_files
    for lora_id in [5, 6]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            lora_id=lora_id,
            lora_name="vision_tower",
        )
6 changes: 6 additions & 0 deletions vllm/config/lora.py
@@ -55,6 +55,11 @@ class LoRAConfig:
    per prompt. When run in offline mode, the lora IDs for n modalities
    will be automatically assigned to 1-n with the names of the modalities
    in alphabetic order."""
    enable_tower_connector_lora: bool = False
    """If `True`, LoRA support for the tower (vision encoder) and connector
    of multimodal models will be enabled. This is an experimental feature and
    currently supports only some multimodal models, such as the Qwen VL
    series. The default is `False`."""

    def compute_hash(self) -> str:
        """
Expand All @@ -73,6 +78,7 @@ def compute_hash(self) -> str:
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.enable_tower_connector_lora)

hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest()
return hash_str
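Because the new flag is appended to the hash factors, two otherwise-identical LoRA configs that differ only in this setting produce different config hashes, so caches keyed on the engine config cannot be shared across the two modes. A quick sketch, assuming `LoRAConfig` can be constructed with defaults:

```python
from vllm.config.lora import LoRAConfig

base = LoRAConfig()
tower = LoRAConfig(enable_tower_connector_lora=True)

# The flag is part of the factors list, so flipping it must change the hash.
assert base.compute_hash() != tower.compute_hash()
```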
19 changes: 19 additions & 0 deletions vllm/engine/arg_utils.py
@@ -483,6 +483,7 @@ class EngineArgs:
    fully_sharded_loras: bool = LoRAConfig.fully_sharded_loras
    max_cpu_loras: int | None = LoRAConfig.max_cpu_loras
    lora_dtype: str | torch.dtype | None = LoRAConfig.lora_dtype
    enable_tower_connector_lora: bool = LoRAConfig.enable_tower_connector_lora

    ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
    num_gpu_blocks_override: int | None = CacheConfig.num_gpu_blocks_override
@@ -996,6 +997,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
            "--lora-dtype",
            **lora_kwargs["lora_dtype"],
        )
        lora_group.add_argument(
"--enable-tower-connector-lora",
**lora_kwargs["enable_tower_connector_lora"],
)
lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"])
lora_group.add_argument(
"--fully-sharded-loras", **lora_kwargs["fully_sharded_loras"]
@@ -1631,6 +1636,7 @@ def create_engine_config(
                default_mm_loras=self.default_mm_loras,
                fully_sharded_loras=self.fully_sharded_loras,
                lora_dtype=self.lora_dtype,
                enable_tower_connector_lora=self.enable_tower_connector_lora,
                max_cpu_loras=self.max_cpu_loras
                if self.max_cpu_loras and self.max_cpu_loras > 0
                else None,
@@ -1639,6 +1645,19 @@
            else None
        )

        if (
            lora_config is not None
            and lora_config.enable_tower_connector_lora
            and self.mm_processor_cache_gb != 0
        ):
            raise ValueError(
                "Currently, enable_tower_connector_lora is "
                "incompatible with the multi-modal processor cache. "
                "When enable_tower_connector_lora is set, "
                "mm_processor_cache_gb must be 0, got "
                f"{self.mm_processor_cache_gb}."
            )

        if (
            lora_config is not None
            and speculative_config is not None
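The new check makes the incompatibility fail fast at config-creation time instead of surfacing deep inside the engine. A sketch of the behavior it enforces; the argument names come from this diff, while the remaining defaults (and the exact `create_engine_config` signature) are assumed:

```python
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="Qwen/Qwen2-VL-2B-Instruct",
    enable_lora=True,
    enable_tower_connector_lora=True,
    mm_processor_cache_gb=4,  # non-zero cache conflicts with the flag
)

try:
    args.create_engine_config()
except ValueError as err:
    # Expected: enable_tower_connector_lora requires mm_processor_cache_gb=0.
    print(err)
```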
3 changes: 2 additions & 1 deletion vllm/lora/layers/__init__.py
@@ -18,7 +18,7 @@
    RowParallelLinearWithLoRA,
    RowParallelLinearWithShardedLoRA,
)
-from vllm.lora.layers.utils import LoRAMapping
+from vllm.lora.layers.utils import LoRAMapping, LoRAMappingType
from vllm.lora.layers.vocal_parallel_embedding import VocabParallelEmbeddingWithLoRA

__all__ = [
@@ -37,6 +37,7 @@
    "RowParallelLinearWithShardedLoRA",
    "ReplicatedLinearWithLoRA",
    "LoRAMapping",
    "LoRAMappingType",
    "FusedMoEWithLoRA",
    "FusedMoE3DWithLoRA",
]