Skip to content
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
e1b83e7
Init
jeejeelee Mar 2, 2026
193bbd1
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 2, 2026
ac348b2
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 2, 2026
ee4cbd8
Move forward
jeejeelee Mar 2, 2026
668066b
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 2, 2026
956a551
Fix
jeejeelee Mar 2, 2026
23a2758
Move
jeejeelee Mar 2, 2026
f09723b
Move
jeejeelee Mar 4, 2026
1ff9932
Merge branch 'vllm-project:main' into lora-dual-stream
jeejeelee Mar 5, 2026
d893daa
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 7, 2026
0f8a59d
Test
jeejeelee Mar 8, 2026
092b206
Address conflict
jeejeelee Mar 8, 2026
76898e0
Address conflict
jeejeelee Mar 8, 2026
3dd4957
Merge remote-tracking branch 'origin/main' into lora-dual-stream
jeejeelee Mar 10, 2026
de098b0
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 11, 2026
ae6a597
Address conflict
jeejeelee Mar 20, 2026
90a5ede
Merge remote-tracking branch 'origin/main' into lora-dual-stream
jeejeelee Mar 20, 2026
f38ca3c
OPT
jeejeelee Mar 20, 2026
21f9c81
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 20, 2026
af9bb12
FIX
jeejeelee Mar 30, 2026
551d295
FIX
jeejeelee Mar 30, 2026
97fea1a
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 30, 2026
bd8b5a9
FIX
jeejeelee Mar 30, 2026
6abfcfd
FIX
jeejeelee Mar 30, 2026
b2b437b
FIX
jeejeelee Mar 30, 2026
9d8515b
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 30, 2026
66a734c
FIX
jeejeelee Mar 31, 2026
7077887
Reveet
jeejeelee Mar 31, 2026
fd759ae
FMT
jeejeelee Mar 31, 2026
13a3d23
FIX
jeejeelee Mar 31, 2026
691e0f4
Merge branch 'main' into lora-dual-stream
jeejeelee Mar 31, 2026
a57dedb
Update
jeejeelee Apr 3, 2026
cfd110b
Merge branch 'main' into lora-dual-stream
jeejeelee Apr 3, 2026
619703a
Merge branch 'main' into lora-dual-stream
jeejeelee Apr 9, 2026
3e33425
Merge branch 'main' into lora-dual-stream
jeejeelee Apr 9, 2026
c2dbc3c
Merge branch 'main' into lora-dual-stream
jeejeelee Apr 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions tests/lora/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
cleanup_dist_env_and_memory(shutdown_ray=True)


@pytest.fixture
def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
    """Opt a test into the LoRA dual-stream path when running on CUDA.

    Sets the ``VLLM_LORA_ENABLE_DUAL_STREAM`` environment variable to
    ``"1"`` through ``monkeypatch`` if ``current_platform.is_cuda()``
    reports true; on any other platform the environment is left untouched.
    Control is then yielded to the requesting test.

    Args:
        monkeypatch: pytest's built-in fixture used to set the variable.
    """
    running_on_cuda = current_platform.is_cuda()
    if running_on_cuda:
        # The change goes through monkeypatch rather than os.environ directly.
        monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
    yield
Comment thread
jeejeelee marked this conversation as resolved.


@pytest.fixture
def dist_init():
from tests.utils import ensure_current_vllm_config
Expand Down
58 changes: 43 additions & 15 deletions tests/lora/test_layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,10 @@ def test_linear_replicated(
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)

def create_random_linear_replicated_layer():
linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
def create_random_linear_replicated_layer(idx: int = 0):
linear = ReplicatedLinear(
4096, 4096, bias=False, params_dtype=torch.float16, prefix=f"layer_{idx}"
)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = ReplicatedLinearWithLoRA(linear)

Expand All @@ -539,7 +541,7 @@ def create_random_linear_replicated_layer():
set_random_seed(i)

id_to_index = get_random_id_to_index(num_loras, max_loras)
linear, lora_linear = create_random_linear_replicated_layer()
linear, lora_linear = create_random_linear_replicated_layer(i)
assert torch.equal(linear.weight, lora_linear.weight)
lora_linear.set_mapping(punica_wrapper)
lora_dict, _ = populate_loras(
Expand Down Expand Up @@ -629,10 +631,14 @@ def test_linear_parallel(
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)

def create_random_linear_parallel_layer():
def create_random_linear_parallel_layer(idx: int = 0):
if orientation == "row":
linear = RowParallelLinear(
4096, 4096, bias=False, params_dtype=torch.float16
4096,
4096,
bias=False,
params_dtype=torch.float16,
prefix=f"layer_{idx}",
)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = (
Expand All @@ -642,7 +648,11 @@ def create_random_linear_parallel_layer():
)
else:
linear = ColumnParallelLinear(
4096, 4096, bias=False, params_dtype=torch.float16
4096,
4096,
bias=False,
params_dtype=torch.float16,
prefix=f"layer_{idx}",
)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = (
Expand All @@ -664,7 +674,7 @@ def create_random_linear_parallel_layer():
set_random_seed(i)

id_to_index = get_random_id_to_index(num_loras, max_loras)
linear, lora_linear = create_random_linear_parallel_layer()
linear, lora_linear = create_random_linear_parallel_layer(i)
assert torch.equal(linear.weight, lora_linear.weight)
lora_linear.set_mapping(punica_wrapper)
lora_dict, _ = populate_loras(
Expand Down Expand Up @@ -754,10 +764,14 @@ def test_column_parallel_packed(
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)

def create_column_parallel_packed_layer():
def create_column_parallel_packed_layer(idx: int = 0):
if repeats == 2:
linear = MergedColumnParallelLinear(
4096, [4096] * repeats, bias=False, params_dtype=torch.float16
4096,
[4096] * repeats,
bias=False,
params_dtype=torch.float16,
prefix=f"layer_{idx}",
)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = (
Expand All @@ -767,7 +781,12 @@ def create_column_parallel_packed_layer():
)
elif repeats == 3:
linear = QKVParallelLinear(
4096, 64, 32, bias=False, params_dtype=torch.float16
4096,
64,
32,
bias=False,
params_dtype=torch.float16,
prefix=f"layer_{idx}",
)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = (
Expand All @@ -777,7 +796,12 @@ def create_column_parallel_packed_layer():
)
else:
linear = QKVParallelLinear(
4096, 64, 32, bias=False, params_dtype=torch.float16
4096,
64,
32,
bias=False,
params_dtype=torch.float16,
prefix=f"layer_{idx}",
)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = (
Expand Down Expand Up @@ -810,7 +834,7 @@ class FakeConfig:

id_to_index = get_random_id_to_index(num_loras, max_loras)

linear, lora_linear = create_column_parallel_packed_layer()
linear, lora_linear = create_column_parallel_packed_layer(i)
assert torch.equal(linear.weight, lora_linear.weight)
lora_linear.set_mapping(punica_wrapper)
lora_dict, sublora_dict = populate_loras(
Expand Down Expand Up @@ -902,10 +926,14 @@ def test_merged_column_parallel_variable_slice(
output_sizes = [1024 + i * 256 for i in range(num_slices)]
total_output = sum(output_sizes)

def create_layer():
def create_layer(idx: int = 0):
# Create linear layer
linear = MergedColumnParallelLinear(
4096, output_sizes, bias=False, params_dtype=torch.float16
4096,
output_sizes,
bias=False,
params_dtype=torch.float16,
prefix=f"layer_{idx}",
)
linear.weight.data = torch.rand_like(linear.weight.data)

Expand All @@ -917,7 +945,7 @@ def create_layer():
for i in range(NUM_RANDOM_SEEDS):
set_random_seed(i)
id_to_index = get_random_id_to_index(num_loras, max_loras)
linear, lora_linear = create_layer()
linear, lora_linear = create_layer(i)
lora_linear.set_mapping(punica_wrapper)

# Populate LoRA weights
Expand Down
6 changes: 4 additions & 2 deletions tests/lora/test_olmoe_tp.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def generate_and_test(
)


def test_olmoe_lora(olmoe_lora_files):
def test_olmoe_lora(olmoe_lora_files, maybe_enable_lora_dual_stream):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM.
llm = vllm.LLM(
Expand Down Expand Up @@ -141,7 +141,9 @@ def test_olmoe_lora_mixed(olmoe_lora_files):
generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])


def test_olmoe_lora_mixed_random(olmoe_lora_files, tmp_path):
def test_olmoe_lora_mixed_random(
olmoe_lora_files, tmp_path, maybe_enable_lora_dual_stream
):
# Create a dummy LoRA with random weights based on the real one
random_lora_path = tmp_path / "random_lora"
shutil.copytree(olmoe_lora_files, random_lora_path)
Expand Down
8 changes: 6 additions & 2 deletions tests/lora/test_qwen35_densemodel_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,9 @@ def _assert_qwen35_text_vl_and_mixed_lora(


@create_new_process_for_each_test()
def test_qwen35_text_lora(qwen35_text_lora_files, qwen35_vl_lora_files):
def test_qwen35_text_lora(
qwen35_text_lora_files, qwen35_vl_lora_files, maybe_enable_lora_dual_stream
):
llm = vllm.LLM(
model=MODEL_PATH,
max_model_len=4096,
Expand All @@ -335,7 +337,9 @@ def test_qwen35_text_lora(qwen35_text_lora_files, qwen35_vl_lora_files):


@multi_gpu_test(num_gpus=4)
def test_qwen35_text_lora_tp4(qwen35_text_lora_files, qwen35_vl_lora_files):
def test_qwen35_text_lora_tp4(
qwen35_text_lora_files, qwen35_vl_lora_files, maybe_enable_lora_dual_stream
):
llm = vllm.LLM(
model=MODEL_PATH,
max_model_len=4096,
Expand Down
11 changes: 10 additions & 1 deletion vllm/config/lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
from pydantic import ConfigDict, Field, model_validator
from typing_extensions import Self

from vllm import envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.hashing import safe_hash

if TYPE_CHECKING:
Expand Down Expand Up @@ -105,7 +107,14 @@ def _validate_lora_config(self) -> Self:
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})."
)

if envs.VLLM_LORA_ENABLE_DUAL_STREAM and not current_platform.is_cuda_alike():
raise ValueError("Dual CUDA streams are only supported on CUDA platforms.")
if envs.VLLM_LORA_ENABLE_DUAL_STREAM and self.fully_sharded_loras:
logger.warning_once(
"fully_sharded_loras isn't compatible with "
"VLLM_LORA_ENABLE_DUAL_STREAM, set VLLM_LORA_ENABLE_DUAL_STREAM=False"
)
envs.VLLM_LORA_ENABLE_DUAL_STREAM = False
return self

def verify_with_model_config(self, model_config: ModelConfig):
Expand Down
Loading
Loading