vllm/distributed/device_communicators/pynccl_wrapper.py (8 changes: 6 additions & 2 deletions)
@@ -72,7 +72,8 @@ class ncclDataTypeEnum:
ncclFloat64 = 8
ncclDouble = 8
ncclBfloat16 = 9
ncclNumTypes = 10
ncclFloat8e4m3 = 10
ncclNumTypes = 11

@classmethod
def from_torch(cls, dtype: torch.dtype) -> int:
@@ -92,9 +93,12 @@ def from_torch(cls, dtype: torch.dtype) -> int:
return cls.ncclFloat64
if dtype == torch.bfloat16:
return cls.ncclBfloat16
if dtype == torch.float8_e4m3fn:
return cls.ncclFloat8e4m3
raise ValueError(
f"Unsupported dtype {dtype}: should be one of "
f"int8, uint8, int32, int64, float16, float32, float64, bfloat16."
f"int8, uint8, int32, int64, float16, float32, float64, bfloat16,"
" float8e4m3."
)
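For context, a minimal usage sketch of the new mapping, assuming the enum values shown in the hunk above (`torch.float8_e4m3fn` requires a PyTorch build that exposes the FP8 dtypes):

```python
import torch

from vllm.distributed.device_communicators.pynccl_wrapper import ncclDataTypeEnum

# FP8 e4m3 tensors now resolve to the new enum entry instead of raising.
assert ncclDataTypeEnum.from_torch(torch.float8_e4m3fn) == ncclDataTypeEnum.ncclFloat8e4m3  # 10
assert ncclDataTypeEnum.from_torch(torch.bfloat16) == ncclDataTypeEnum.ncclBfloat16         # 9
```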


Second changed file (FlashInfer CUTLASS MoE prepare/finalize module; path not shown in this view)
@@ -168,8 +168,10 @@ def __init__(
use_dp: bool,
num_dispatchers: int = 1,
use_deepseek_fp8_block_scale: bool = False,
use_ep: bool = True,
):
super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale)
self.use_ep = use_ep

def prepare(
self,
@@ -197,12 +199,15 @@ def prepare(
quant_config.block_shape,
is_fp4_scale_swizzled=not self.use_dp,
)
if not is_nvfp4:
# per-tensor scales are static and shouldn't be communicated
a1q_scale = None
else:
# Block-scale path: pass activations through, omit per-token scales
a1q = a1
a1q_scale = None

if self.use_dp:
if self.use_dp and self.use_ep:
# Build gather list conditionally - omit a1q_scale if None
# (block-scale path)
gather_list = [topk_weights, topk_ids, a1q]
Expand All @@ -225,6 +230,8 @@ def prepare(

if is_nvfp4 and a1q_scale is not None:
a1q_scale = nvfp4_block_scale_interleave(a1q_scale)
elif not self.use_deepseek_fp8_block_scale and not is_nvfp4:
a1q_scale = quant_config.a1_scale

return a1q, a1q_scale, None, topk_ids, topk_weights
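Reading the branches above together, the activation-scale handling after this change can be summarized by the following hedged sketch (an illustrative helper, not the actual vLLM code; the real NVFP4 path additionally swizzles the block scales with `nvfp4_block_scale_interleave`):

```python
def resolve_a1q_scale(is_nvfp4: bool, use_deepseek_fp8_block_scale: bool,
                      a1q_scale, quant_config):
    if is_nvfp4:
        # NVFP4: per-token block scales travel with the activations
        # (and are interleaved for the kernel in the real code).
        return a1q_scale
    if not use_deepseek_fp8_block_scale:
        # Per-tensor FP8: the scale is static, so it is never communicated;
        # it is re-read from the quant config after the optional all-gather.
        return quant_config.a1_scale
    # DeepSeek FP8 block-scale: activations pass through unquantized,
    # so no activation scale is returned at all.
    return None
```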

@@ -239,7 +246,7 @@ def finalize(
) -> None:
assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP)

if self.use_dp:
if self.use_dp and self.use_ep:
fused_expert_output = get_dp_group().reduce_scatterv(
fused_expert_output, dim=0, sizes=get_local_sizes()
)
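The `finalize()` guard mirrors the one in `prepare()`. A minimal sketch of the shared condition; the rationale in the comment is an assumption about why disabling EP skips the collectives, not a statement from this PR:

```python
def needs_dp_collectives(use_dp: bool, use_ep: bool) -> bool:
    # When expert parallelism is off, each DP rank already holds every expert,
    # so tokens are neither all-gathered before the fused MoE kernel nor
    # reduce-scattered after it.
    return use_dp and use_ep
```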
@@ -353,6 +360,7 @@ def create_flashinfer_prepare_finalize(
use_nvfp4: bool = False,
enable_alltoallv: bool = False,
use_deepseek_fp8_block_scale: bool = False,
use_ep: bool = True,
) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP:
"""Factory function to create the appropriate FlashInfer implementation."""

@@ -361,8 +369,9 @@ def create_flashinfer_prepare_finalize(
assert use_nvfp4
return FlashInferAllToAllMoEPrepareAndFinalize(use_dp)
return FlashInferAllGatherMoEPrepareAndFinalize(
use_dp=True,
use_dp=use_dp,
use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
use_ep=use_ep,
)
else:
# CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization
Third changed file (FlashInfer FP8 CUTLASS MoE builder; path not shown in this view)
@@ -199,7 +199,9 @@ def build_flashinfer_fp8_cutlass_moe_prepare_finalize(
# Propagate block-scale flag so prepare/finalize can skip act quantization
# and inform the kernel to consume per-block weight scales.
return create_flashinfer_prepare_finalize(
use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale
use_dp,
use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
use_ep=moe.use_ep if moe is not None else False,
)
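A small sketch of the propagation pattern added here, assuming `moe` is the optional MoE config object read in this hunk (the builder's remaining parameters are elided in the diff view). Note the asymmetry: the factory's `use_ep` parameter defaults to `True`, while a missing config falls back to `False`, which presumably keeps the no-EP path free of DP collectives:

```python
# Fall back to False when no MoE config is available, so the prepare/finalize
# object skips the DP all-gather / reduce-scatter path by default.
use_ep = moe.use_ep if moe is not None else False
```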

