vllm-project · ywang96 · May 26, 2026 · Apr 26, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/benchmarks/kernels/benchmark_fused_moe_lora_one_shot.py b/benchmarks/kernels/benchmark_fused_moe_lora_one_shot.py
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -1971,6 +1971,8 @@ def _resolve_rust_frontend_path() -> str | None:
         int(os.getenv("VLLM_USE_SIMPLE_KV_OFFLOAD", "0"))
     ),
     # Whether to enable dual cuda streams for LoRA computation
+    # (used by both BaseLinearLayerWithLoRA and FusedMoEWithLoRA to
+    # overlap the base layer compute with the LoRA fast path).
     "VLLM_LORA_ENABLE_DUAL_STREAM": lambda: bool(
         int(os.getenv("VLLM_LORA_ENABLE_DUAL_STREAM", "0"))
     ),

@@ -25,16 +25,9 @@
 from vllm.utils.torch_utils import direct_register_custom_op
 
 from .base import BaseLayerWithLoRA
-from .utils import _get_lora_device
+from .utils import _get_lora_aux_cuda_stream, _get_lora_device
 
 if envs.VLLM_LORA_ENABLE_DUAL_STREAM:
-    _lora_aux_cuda_stream: torch.cuda.Stream | None = None
-
-    def _get_lora_aux_cuda_stream() -> torch.cuda.Stream | None:
-        global _lora_aux_cuda_stream
-        if _lora_aux_cuda_stream is None and current_platform.is_cuda_alike():
-            _lora_aux_cuda_stream = torch.cuda.Stream()
-        return _lora_aux_cuda_stream
 
     def lora_linear_async(
         layer_name: str,

@@ -19,8 +19,9 @@
 from vllm.model_executor.layers.fused_moe.prepare_finalize import (
     MoEPrepareAndFinalizeNoDPEPModular,
 )
+from vllm.platforms import current_platform
 
-from .utils import _get_lora_device
+from .utils import _get_lora_aux_cuda_stream, _get_lora_device
 
 
 class FusedMoEWithLoRA(BaseLayerWithLoRA):
@@ -34,6 +35,9 @@ def __init__(self, base_layer: FusedMoE) -> None:
         self.tp_size = self.base_layer.tp_size
         self.tp_rank = self.base_layer.tp_rank
         self.device = _get_lora_device(base_layer)
+
+        self._enable_aux_cuda_stream = envs.VLLM_LORA_ENABLE_DUAL_STREAM
+        self._init_lora_stream_context()
         # For non-gated MoE (is_act_and_mul=False), only 1 slice is needed
         # since there's only up_proj (w1), not gate_proj + up_proj (w1 + w3)
         self._w13_slices = 2 if base_layer.moe_config.is_act_and_mul else 1
@@ -65,7 +69,25 @@ def __init__(self, base_layer: FusedMoE) -> None:
             FusedMoEModularMethod(self.base_layer.quant_method, moe_kernel)
         )
 
+    def _init_lora_stream_context(self) -> None:
+        self._lora_stream: torch.cuda.Stream | None = None
+        self._events: tuple[torch.cuda.Event, ...] | None = None
+        if not self._enable_aux_cuda_stream:
+            return
+        if not current_platform.is_cuda_alike():
+            return
+        self._lora_stream = _get_lora_aux_cuda_stream()
+        # Allocate 4 events, 2 for each (base GEMM, LoRA) pair, so that w13 and
+        # w2 never share event objects. Reusing events within a pair is fine:
+        # the second pair starts only after intermediate_cache1.add_() has joined.
+        self._events = tuple(torch.cuda.Event() for _ in range(4))
+
     def _build_lora_context(self):
+        use_dual_stream = (
+            self._enable_aux_cuda_stream
+            and not self.fully_sharded
+            and self._lora_stream is not None
+        )
         return MoELoRAContext(
             w13_lora_a_stacked=self.w13_lora_a_stacked,
             w13_lora_b_stacked=self.w13_lora_b_stacked,
@@ -81,6 +103,8 @@ def _build_lora_context(self):
             local_num_experts=self.base_layer.local_num_experts,
             punica_wrapper=self.punica_wrapper,
             use_tuned_config=bool(envs.VLLM_TUNED_CONFIG_FOLDER),
+            aux_stream=self._lora_stream if use_dual_stream else None,
+            events=self._events if use_dual_stream else None,
         )
 
     def _create_lora_a_weights(

@@ -7,9 +7,22 @@
 import torch
 import torch.nn as nn
 
+from vllm import envs
 from vllm.model_executor.layers.fused_moe.fused_moe import try_get_optimal_moe_config
+from vllm.platforms import current_platform
 from vllm.utils.math_utils import next_power_of_2
 
+_lora_aux_cuda_stream: torch.cuda.Stream | None = None
+
+
+def _get_lora_aux_cuda_stream() -> torch.cuda.Stream | None:
+    if not envs.VLLM_LORA_ENABLE_DUAL_STREAM:
+        return None
+    global _lora_aux_cuda_stream
+    if _lora_aux_cuda_stream is None and current_platform.is_cuda_alike():
+        _lora_aux_cuda_stream = torch.cuda.Stream()
+    return _lora_aux_cuda_stream
+
 
 class LoRAMappingType(Enum):
     LANGUAGE = 1