vllm-project · DarkLight1337 · Apr 13, 2026 · Mar 2, 2026 · Mar 2, 2026 · Mar 2, 2026
@@ -43,6 +43,13 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
         cleanup_dist_env_and_memory(shutdown_ray=True)
 
 
+@pytest.fixture  # opt-in: sets VLLM_LORA_ENABLE_DUAL_STREAM=1 on CUDA
+def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
+    if current_platform.is_cuda():  # other platforms leave the env untouched
+        monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
+    yield  # test body runs here; monkeypatch reverts the setenv afterwards
+
+
 @pytest.fixture
 def dist_init():
     from tests.utils import ensure_current_vllm_config

@@ -521,8 +521,10 @@ def test_linear_replicated(
     punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
     assert check_punica_wrapper(punica_wrapper)
 
-    def create_random_linear_replicated_layer():
-        linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
+    def create_random_linear_replicated_layer(idx: int = 0):
+        linear = ReplicatedLinear(
+            4096, 4096, bias=False, params_dtype=torch.float16, prefix=f"layer_{idx}"
+        )
         linear.weight.data = torch.rand_like(linear.weight.data)
         lora_linear = ReplicatedLinearWithLoRA(linear)
 
@@ -539,7 +541,7 @@ def create_random_linear_replicated_layer():
         set_random_seed(i)
 
         id_to_index = get_random_id_to_index(num_loras, max_loras)
-        linear, lora_linear = create_random_linear_replicated_layer()
+        linear, lora_linear = create_random_linear_replicated_layer(i)
         assert torch.equal(linear.weight, lora_linear.weight)
         lora_linear.set_mapping(punica_wrapper)
         lora_dict, _ = populate_loras(
@@ -629,10 +631,14 @@ def test_linear_parallel(
     punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
     assert check_punica_wrapper(punica_wrapper)
 
-    def create_random_linear_parallel_layer():
+    def create_random_linear_parallel_layer(idx: int = 0):
         if orientation == "row":
             linear = RowParallelLinear(
-                4096, 4096, bias=False, params_dtype=torch.float16
+                4096,
+                4096,
+                bias=False,
+                params_dtype=torch.float16,
+                prefix=f"layer_{idx}",
             )
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = (
@@ -642,7 +648,11 @@ def create_random_linear_parallel_layer():
             )
         else:
             linear = ColumnParallelLinear(
-                4096, 4096, bias=False, params_dtype=torch.float16
+                4096,
+                4096,
+                bias=False,
+                params_dtype=torch.float16,
+                prefix=f"layer_{idx}",
             )
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = (
@@ -664,7 +674,7 @@ def create_random_linear_parallel_layer():
         set_random_seed(i)
 
         id_to_index = get_random_id_to_index(num_loras, max_loras)
-        linear, lora_linear = create_random_linear_parallel_layer()
+        linear, lora_linear = create_random_linear_parallel_layer(i)
         assert torch.equal(linear.weight, lora_linear.weight)
         lora_linear.set_mapping(punica_wrapper)
         lora_dict, _ = populate_loras(
@@ -754,10 +764,14 @@ def test_column_parallel_packed(
     punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
     assert check_punica_wrapper(punica_wrapper)
 
-    def create_column_parallel_packed_layer():
+    def create_column_parallel_packed_layer(idx: int = 0):
         if repeats == 2:
             linear = MergedColumnParallelLinear(
-                4096, [4096] * repeats, bias=False, params_dtype=torch.float16
+                4096,
+                [4096] * repeats,
+                bias=False,
+                params_dtype=torch.float16,
+                prefix=f"layer_{idx}",
             )
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = (
@@ -767,7 +781,12 @@ def create_column_parallel_packed_layer():
             )
         elif repeats == 3:
             linear = QKVParallelLinear(
-                4096, 64, 32, bias=False, params_dtype=torch.float16
+                4096,
+                64,
+                32,
+                bias=False,
+                params_dtype=torch.float16,
+                prefix=f"layer_{idx}",
             )
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = (
@@ -777,7 +796,12 @@ def create_column_parallel_packed_layer():
             )
         else:
             linear = QKVParallelLinear(
-                4096, 64, 32, bias=False, params_dtype=torch.float16
+                4096,
+                64,
+                32,
+                bias=False,
+                params_dtype=torch.float16,
+                prefix=f"layer_{idx}",
             )
             linear.weight.data = torch.rand_like(linear.weight.data)
             lora_linear = (
@@ -810,7 +834,7 @@ class FakeConfig:
 
         id_to_index = get_random_id_to_index(num_loras, max_loras)
 
-        linear, lora_linear = create_column_parallel_packed_layer()
+        linear, lora_linear = create_column_parallel_packed_layer(i)
         assert torch.equal(linear.weight, lora_linear.weight)
         lora_linear.set_mapping(punica_wrapper)
         lora_dict, sublora_dict = populate_loras(
@@ -902,10 +926,14 @@ def test_merged_column_parallel_variable_slice(
     output_sizes = [1024 + i * 256 for i in range(num_slices)]
     total_output = sum(output_sizes)
 
-    def create_layer():
+    def create_layer(idx: int = 0):
         # Create linear layer
         linear = MergedColumnParallelLinear(
-            4096, output_sizes, bias=False, params_dtype=torch.float16
+            4096,
+            output_sizes,
+            bias=False,
+            params_dtype=torch.float16,
+            prefix=f"layer_{idx}",
         )
         linear.weight.data = torch.rand_like(linear.weight.data)
 
@@ -917,7 +945,7 @@ def create_layer():
     for i in range(NUM_RANDOM_SEEDS):
         set_random_seed(i)
         id_to_index = get_random_id_to_index(num_loras, max_loras)
-        linear, lora_linear = create_layer()
+        linear, lora_linear = create_layer(i)
         lora_linear.set_mapping(punica_wrapper)
 
         # Populate LoRA weights

@@ -110,7 +110,7 @@ def generate_and_test(
         )
 
 
-def test_olmoe_lora(olmoe_lora_files):
+def test_olmoe_lora(olmoe_lora_files, maybe_enable_lora_dual_stream):
     # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
     # Otherwise, the lora-test will fail due to CUDA OOM.
     llm = vllm.LLM(
@@ -141,7 +141,9 @@ def test_olmoe_lora_mixed(olmoe_lora_files):
     generate_and_test(llm, olmoe_lora_files, lora_id=[1, None, 3, None])
 
 
-def test_olmoe_lora_mixed_random(olmoe_lora_files, tmp_path):
+def test_olmoe_lora_mixed_random(
+    olmoe_lora_files, tmp_path, maybe_enable_lora_dual_stream
+):
     # Create a dummy LoRA with random weights based on the real one
     random_lora_path = tmp_path / "random_lora"
     shutil.copytree(olmoe_lora_files, random_lora_path)

@@ -312,7 +312,9 @@ def _assert_qwen35_text_vl_and_mixed_lora(
 
 
 @create_new_process_for_each_test()
-def test_qwen35_text_lora(qwen35_text_lora_files, qwen35_vl_lora_files):
+def test_qwen35_text_lora(
+    qwen35_text_lora_files, qwen35_vl_lora_files, maybe_enable_lora_dual_stream
+):
     llm = vllm.LLM(
         model=MODEL_PATH,
         max_model_len=4096,
@@ -335,7 +337,9 @@ def test_qwen35_text_lora(qwen35_text_lora_files, qwen35_vl_lora_files):
 
 
 @multi_gpu_test(num_gpus=4)
-def test_qwen35_text_lora_tp4(qwen35_text_lora_files, qwen35_vl_lora_files):
+def test_qwen35_text_lora_tp4(
+    qwen35_text_lora_files, qwen35_vl_lora_files, maybe_enable_lora_dual_stream
+):
     llm = vllm.LLM(
         model=MODEL_PATH,
         max_model_len=4096,

@@ -7,8 +7,10 @@
 from pydantic import ConfigDict, Field, model_validator
 from typing_extensions import Self
 
+from vllm import envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils.hashing import safe_hash
 
 if TYPE_CHECKING:
@@ -105,7 +107,14 @@ def _validate_lora_config(self) -> Self:
                 f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                 f"max_loras ({self.max_loras})."
             )
-
+        if envs.VLLM_LORA_ENABLE_DUAL_STREAM and not current_platform.is_cuda_alike():
+            raise ValueError("Dual CUDA streams are only supported on CUDA platforms.")
+        if envs.VLLM_LORA_ENABLE_DUAL_STREAM and self.fully_sharded_loras:
+            logger.warning_once(
+                "fully_sharded_loras isn't compatible with "
+                "VLLM_LORA_ENABLE_DUAL_STREAM, set VLLM_LORA_ENABLE_DUAL_STREAM=False"
+            )
+            envs.VLLM_LORA_ENABLE_DUAL_STREAM = False
         return self
 
     def verify_with_model_config(self, model_config: ModelConfig):