diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 67b6cbe535a4..3b11595b4e4e 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -72,7 +72,8 @@ class ncclDataTypeEnum: ncclFloat64 = 8 ncclDouble = 8 ncclBfloat16 = 9 - ncclNumTypes = 10 + ncclFloat8e4m3 = 10 + ncclNumTypes = 11 @classmethod def from_torch(cls, dtype: torch.dtype) -> int: @@ -92,9 +93,12 @@ def from_torch(cls, dtype: torch.dtype) -> int: return cls.ncclFloat64 if dtype == torch.bfloat16: return cls.ncclBfloat16 + if dtype == torch.float8_e4m3fn: + return cls.ncclFloat8e4m3 raise ValueError( f"Unsupported dtype {dtype}: should be one of " - f"int8, uint8, int32, int64, float16, float32, float64, bfloat16." + f"int8, uint8, int32, int64, float16, float32, float64, bfloat16," + " float8e4m3." ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index dfff860750d6..d8cd6148bb31 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -168,8 +168,10 @@ def __init__( use_dp: bool, num_dispatchers: int = 1, use_deepseek_fp8_block_scale: bool = False, + use_ep: bool = True, ): super().__init__(use_dp, num_dispatchers, use_deepseek_fp8_block_scale) + self.use_ep = use_ep def prepare( self, @@ -197,12 +199,15 @@ def prepare( quant_config.block_shape, is_fp4_scale_swizzled=not self.use_dp, ) + if not is_nvfp4: + # per-tensor scales are static and shouldn't be communicated + a1q_scale = None else: # Block-scale path: pass activations through, omit per-token scales a1q = a1 a1q_scale = None - if self.use_dp: + if self.use_dp and self.use_ep: # Build gather list conditionally - omit a1q_scale if None # (block-scale path) gather_list = [topk_weights, topk_ids, a1q] @@ -225,6 +230,8 @@ def prepare( if is_nvfp4 and a1q_scale is not None: a1q_scale = nvfp4_block_scale_interleave(a1q_scale) + elif not self.use_deepseek_fp8_block_scale and not is_nvfp4: + a1q_scale = quant_config.a1_scale return a1q, a1q_scale, None, topk_ids, topk_weights @@ -239,7 +246,7 @@ def finalize( ) -> None: assert isinstance(weight_and_reduce_impl, TopKWeightAndReduceNoOP) - if self.use_dp: + if self.use_dp and self.use_ep: fused_expert_output = get_dp_group().reduce_scatterv( fused_expert_output, dim=0, sizes=get_local_sizes() ) @@ -353,6 +360,7 @@ def create_flashinfer_prepare_finalize( use_nvfp4: bool = False, enable_alltoallv: bool = False, use_deepseek_fp8_block_scale: bool = False, + use_ep: bool = True, ) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP: """Factory function to create the appropriate FlashInfer implementation.""" @@ -361,8 +369,9 @@ def create_flashinfer_prepare_finalize( assert use_nvfp4 return FlashInferAllToAllMoEPrepareAndFinalize(use_dp) return FlashInferAllGatherMoEPrepareAndFinalize( - use_dp=True, + use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, + use_ep=use_ep, ) else: # CUTLASS FP8 BLOCK and CUTLASS NVFP4 apply input quantization diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 799854479823..f698816f502e 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -199,7 +199,9 @@ def build_flashinfer_fp8_cutlass_moe_prepare_finalize( # Propagate block-scale flag so prepare/finalize can skip act quantization # and inform the kernel to consume per-block weight scales. return create_flashinfer_prepare_finalize( - use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale + use_dp, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, + use_ep=moe.use_ep if moe is not None else False, )