
Commit 18bf91e

wip

Signed-off-by: Lucas Wilkinson <[email protected]>
1 parent 00f526f

File tree: 4 files changed, +17 -5 lines


examples/basic-ub.py

Lines changed: 4 additions & 4 deletions

@@ -40,11 +40,11 @@ def main():
         max_model_len=1024,
         #load_format="dummy",
         ###############
-        tensor_parallel_size=1,
-        #data_parallel_size=2,
-        enable_expert_parallel=False,
+        #tensor_parallel_size=1,
+        data_parallel_size=2,
+        enable_expert_parallel=True,
         ###############
-        enable_microbatching=True,
+        #enable_microbatching=True,
     )
     # Generate texts from the prompts.
     # The output is a list of RequestOutput objects
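This hunk flips the example from 1-way tensor parallelism to 2-way data parallelism with expert parallelism, and comments out the WIP microbatching flag. For context, a minimal sketch of the full example under these settings, assuming vLLM's offline `LLM` API; the model name is a hypothetical stand-in, and `enable_microbatching` exists only on this branch:

```python
# Minimal sketch, assuming vLLM's offline LLM API; model name is hypothetical
# and enable_microbatching is a WIP flag from this branch only.
from vllm import LLM, SamplingParams

def main():
    llm = LLM(
        model="deepseek-ai/DeepSeek-V2-Lite",  # hypothetical model choice
        max_model_len=1024,
        data_parallel_size=2,          # two DP replicas, as in the diff
        enable_expert_parallel=True,   # shard MoE experts across ranks
        # enable_microbatching=True,   # WIP flag, commented out in this commit
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    for out in outputs:
        print(out.outputs[0].text)

if __name__ == "__main__":
    main()
```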

vllm/config.py

Lines changed: 1 addition & 1 deletion

@@ -4332,7 +4332,7 @@ def __post_init__(self):
             logger.warning_once(
                 "Piecewise compilation is not supported with "
                 "microbatching. Disabling piecewise compilation.")
-            self.compilation_config.level = CompilationLevel.DYNAMO_ONCE
+            self.compilation_config.level = CompilationLevel.NO_COMPILATION

         if self.model_config and self.model_config.use_mla and \
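The change escalates the fallback: instead of dropping from piecewise compilation to `DYNAMO_ONCE` when microbatching is on, compilation is now disabled entirely. A self-contained sketch of the guard this hunk lives in; the enum values mirror vLLM's `CompilationLevel`, but the enclosing condition name is an assumption, not verbatim source:

```python
# Standalone sketch; CompilationLevel values mirror vLLM's config.py,
# but enable_microbatching as the guard condition is an assumption.
import enum
import logging

logger = logging.getLogger(__name__)

class CompilationLevel(enum.IntEnum):
    NO_COMPILATION = 0
    DYNAMO_AS_IS = 1
    DYNAMO_ONCE = 2
    PIECEWISE = 3

def apply_microbatching_guard(compilation_config, enable_microbatching):
    # Piecewise compilation splits the graph into separately compiled
    # pieces; that does not compose with interleaved microbatches, so
    # the commit turns compilation off entirely rather than falling
    # back to DYNAMO_ONCE.
    if enable_microbatching:
        logger.warning("Piecewise compilation is not supported with "
                       "microbatching. Disabling piecewise compilation.")
        compilation_config.level = CompilationLevel.NO_COMPILATION

class Cfg:
    level = CompilationLevel.PIECEWISE

cfg = Cfg()
apply_microbatching_guard(cfg, enable_microbatching=True)
assert cfg.level == CompilationLevel.NO_COMPILATION
```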

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 8 additions & 0 deletions

@@ -7,6 +7,7 @@
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
+from vllm.v1.worker.ubatching import get_current_ubatch_context, yield_impl


 # Note use: layer.get_all_to_all() to get an AllToAll instance
@@ -117,7 +118,11 @@ def dispatch(send: bool):
             do_send=send,
             do_recv=not send,
         )
+
+        # if ubatch_ctx is not None:
+        #     ubatch_ctx.gpu_stream_wait()
         dispatch(True)  # Send
+        yield_impl(gpu_wait=False)
         dispatch(False)  # Recv

         return expert_x, expert_x_scale, expert_num_tokens
@@ -155,5 +160,8 @@ def combine(send: bool):
             do_send=send,
             do_recv=not send,
         )
+        # if ubatch_ctx is not None:
+        #     ubatch_ctx.gpu_stream_wait()
         combine(True)
+        yield_impl(gpu_wait=False)
         combine(False)
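The point of this hunk is that each pplx all-to-all is already split into a send half and a recv half; inserting `yield_impl(gpu_wait=False)` between them lets one microbatch's communication stay in flight while the other microbatch runs. A toy model of that interleave, assuming nothing about the branch's real `ubatching` module (which presumably coordinates CUDA streams and threads); plain generators stand in for microbatch contexts:

```python
# Toy sketch of the send/yield/recv pattern; generators stand in for
# microbatch contexts, not the branch's actual ubatching mechanism.
def moe_layer(ubatch_id):
    print(f"ubatch {ubatch_id}: dispatch send")   # dispatch(True)
    yield                                          # yield_impl(gpu_wait=False)
    print(f"ubatch {ubatch_id}: dispatch recv")   # dispatch(False)
    print(f"ubatch {ubatch_id}: expert compute")
    print(f"ubatch {ubatch_id}: combine send")    # combine(True)
    yield                                          # yield_impl(gpu_wait=False)
    print(f"ubatch {ubatch_id}: combine recv")    # combine(False)

# Round-robin the two microbatches: while ubatch 0's all-to-all is in
# flight, ubatch 1 makes progress, and vice versa.
pending = [moe_layer(0), moe_layer(1)]
while pending:
    still_running = []
    for ub in pending:
        try:
            next(ub)
            still_running.append(ub)
        except StopIteration:
            pass
    pending = still_running
```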

vllm/model_executor/models/deepseek_v2.py

Lines changed: 4 additions & 0 deletions

@@ -49,6 +49,7 @@
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
+from vllm.v1.worker.ubatching import get_current_ubatch_context

 from .interfaces import SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
@@ -656,6 +657,9 @@ def forward(
         intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
+        if (ubatch_ctx := get_current_ubatch_context()) is not None:
+            print("in forward, ubatch:", ubatch_ctx.id)
+
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
                 hidden_states = inputs_embeds
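Note the parentheses around the walrus assignment in the hunk above: without them, Python binds the result of the `is not None` comparison to the name, not the context object. A tiny runnable demonstration of the pitfall, using a stand-in for `get_current_ubatch_context`:

```python
# Pure-Python precedence demo; get_ctx stands in for
# get_current_ubatch_context().
def get_ctx():
    return "ctx"

# Wrong: `x := get_ctx() is not None` binds the boolean to x.
if x := get_ctx() is not None:
    print(x)            # True -- not the context object

# Right: parenthesize the assignment so x is the context itself.
if (x := get_ctx()) is not None:
    print(x)            # "ctx"
```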
