Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions vllm/distributed/device_communicators/pynccl_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ class ncclDataTypeEnum:
ncclFloat64 = 8
ncclDouble = 8
ncclBfloat16 = 9
ncclNumTypes = 10
ncclFloat8e4m3 = 10
ncclNumTypes = 11

@classmethod
def from_torch(cls, dtype: torch.dtype) -> int:
Expand All @@ -92,9 +93,12 @@ def from_torch(cls, dtype: torch.dtype) -> int:
return cls.ncclFloat64
if dtype == torch.bfloat16:
return cls.ncclBfloat16
if dtype == torch.float8_e4m3fn:
return cls.ncclFloat8e4m3
raise ValueError(
f"Unsupported dtype {dtype}: should be one of "
f"int8, uint8, int32, int64, float16, float32, float64, bfloat16."
f"int8, uint8, int32, int64, float16, float32, float64, bfloat16,"
" float8e4m3."
)


Expand Down
11 changes: 0 additions & 11 deletions vllm/model_executor/layers/fused_moe/prepare_finalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,6 @@ def prepare(
# TODO - this is just for deepgemm?
expert_tokens_meta = None

from vllm.platforms import current_platform

# The torch ops do not support fp8, so use an int8 view.
# Since dispatch does not do a reduce, this is safe to do.
use_int8_view = a1q.dtype == current_platform.fp8_dtype()
if use_int8_view:
a1q = a1q.view(torch.int8)

# Skip gathering scales if we have static quantization
# (the scale is a scalar, replicated on all ranks) or
# if quantization is deferred.
Expand All @@ -106,9 +98,6 @@ def prepare(
assert scales is not None and len(scales) == 1
a1q_scale = scales[0]

if use_int8_view:
a1q = a1q.view(current_platform.fp8_dtype())

# NOTE: shuffle into format expected by FLASHINFER_CUTLASS
# There are currently no other kernels that use this P/F
# with nvfp4. If we add other kernels in the future, we
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/fused_moe/shared_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ def __init__(
use_overlapped
and not (
(self.enable_eplb and backend != "allgather_reducescatter")
or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
# TODO: Is this correct?
or self.moe_parallel_config.use_fi_all2allv_kernels
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The `TODO: Is this correct?` comment indicates uncertainty about the logic for disabling shared expert overlap when `use_fi_all2allv_kernels` is true. If this condition is incorrect, it could lead to improper disabling of shared expert overlap, potentially impacting performance or correctness. Please verify this logic and either remove the TODO comment with a clarifying explanation or correct the condition if it is found to be wrong.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed DP check changes overlap disabling behavior

Medium Severity

The condition for disabling shared expert overlap changed from checking `self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1` to just `self.moe_parallel_config.use_fi_all2allv_kernels`. The original comment indicated overlap was only disabled "with DP, since there is nothing to gain." The removal of the `dp_size > 1` check means overlap is now disabled even when `dp_size == 1`, which may unnecessarily reduce performance. The TODO comment "Is this correct?" indicates the author's uncertainty about this change.

Fix in Cursor Fix in Web

)
and self._shared_experts is not None
)
Expand Down
Loading