diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 67b6cbe535a4..3b11595b4e4e 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -72,7 +72,8 @@ class ncclDataTypeEnum:
     ncclFloat64 = 8
     ncclDouble = 8
     ncclBfloat16 = 9
-    ncclNumTypes = 10
+    ncclFloat8e4m3 = 10
+    ncclNumTypes = 11
 
     @classmethod
     def from_torch(cls, dtype: torch.dtype) -> int:
@@ -92,9 +93,12 @@ def from_torch(cls, dtype: torch.dtype) -> int:
             return cls.ncclFloat64
         if dtype == torch.bfloat16:
             return cls.ncclBfloat16
+        if dtype == torch.float8_e4m3fn:
+            return cls.ncclFloat8e4m3
         raise ValueError(
             f"Unsupported dtype {dtype}: should be one of "
-            f"int8, uint8, int32, int64, float16, float32, float64, bfloat16."
+            f"int8, uint8, int32, int64, float16, float32, float64, bfloat16,"
+            " float8e4m3."
         )
 
 
diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
index 2f0ea6f9bdb0..71e3ac14ad80 100644
--- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py
@@ -78,14 +78,6 @@ def prepare(
         # TODO - this is just for deepgemm?
         expert_tokens_meta = None
 
-        from vllm.platforms import current_platform
-
-        # The torch ops do not support fp8, so use an int8 view.
-        # Since dispatch does not do a reduce, this is safe to do.
-        use_int8_view = a1q.dtype == current_platform.fp8_dtype()
-        if use_int8_view:
-            a1q = a1q.view(torch.int8)
-
         # Skip gathering scales if we have static quantization
         # (the scale is a scalar, replicated on all ranks) or
         # if quantization is deferred.
@@ -106,9 +98,6 @@ def prepare(
         assert scales is not None and len(scales) == 1
         a1q_scale = scales[0]
 
-        if use_int8_view:
-            a1q = a1q.view(current_platform.fp8_dtype())
-
         # NOTE: shuffle into format expected by FLASHINFER_CUTLASS
         # There are currently no other kernels that use this P/F
         # with nvfp4. If we add other kernels in the future, we
diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
index a143347b19f2..d85408d8ee50 100644
--- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py
@@ -37,7 +37,8 @@ def __init__(
             use_overlapped
             and not (
                 (self.enable_eplb and backend != "allgather_reducescatter")
-                or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
+                # TODO: Is this correct?
+                or self.moe_parallel_config.use_fi_all2allv_kernels
             )
             and self._shared_experts is not None
         )