Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions vllm/distributed/device_communicators/pynccl_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ class ncclDataTypeEnum:
ncclFloat64 = 8
ncclDouble = 8
ncclBfloat16 = 9
ncclNumTypes = 10
ncclFloat8e4m3 = 10
ncclNumTypes = 11

@classmethod
def from_torch(cls, dtype: torch.dtype) -> int:
Expand All @@ -92,9 +93,12 @@ def from_torch(cls, dtype: torch.dtype) -> int:
return cls.ncclFloat64
if dtype == torch.bfloat16:
return cls.ncclBfloat16
if dtype == torch.float8_e4m3fn:
return cls.ncclFloat8e4m3
raise ValueError(
f"Unsupported dtype {dtype}: should be one of "
f"int8, uint8, int32, int64, float16, float32, float64, bfloat16."
f"int8, uint8, int32, int64, float16, float32, float64, bfloat16,"
" float8e4m3."
)


Expand Down
11 changes: 0 additions & 11 deletions vllm/model_executor/layers/fused_moe/prepare_finalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,6 @@ def prepare(
# TODO - this is just for deepgemm?
expert_tokens_meta = None

from vllm.platforms import current_platform

# The torch ops do not support fp8, so use an int8 view.
# Since dispatch does not do a reduce, this is safe to do.
use_int8_view = a1q.dtype == current_platform.fp8_dtype()
if use_int8_view:
a1q = a1q.view(torch.int8)

# Skip gathering scales if we have static quantization
# (the scale is a scalar, replicated on all ranks) or
# if quantization is deferred.
Expand All @@ -106,9 +98,6 @@ def prepare(
assert scales is not None and len(scales) == 1
a1q_scale = scales[0]

if use_int8_view:
a1q = a1q.view(current_platform.fp8_dtype())

# NOTE: shuffle into format expected by FLASHINFER_CUTLASS
# There are currently no other kernels that use this P/F
# with nvfp4. If we add other kernels in the future, we
Expand Down
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/fused_moe/shared_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ def __init__(
use_overlapped
and not (
(self.enable_eplb and backend != "allgather_reducescatter")
or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1)
# TODO: Is this correct?
or self.moe_parallel_config.use_fi_all2allv_kernels
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The `TODO: Is this correct?` comment indicates uncertainty about the logic for disabling shared expert overlap when `use_fi_all2allv_kernels` is true. If this condition is incorrect, it could lead to improper disabling of shared expert overlap, potentially impacting performance or correctness. Please verify this logic and either remove the TODO comment with a clarifying explanation or correct the condition if it is found to be wrong.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed DP check changes overlap disabling behavior

Medium Severity

The condition for disabling shared expert overlap changed from checking `self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1` to just `self.moe_parallel_config.use_fi_all2allv_kernels`. The original comment indicated overlap was only disabled "with DP, since there is nothing to gain." The removal of the `dp_size > 1` check means overlap is now disabled even when `dp_size == 1`, which may unnecessarily reduce performance. The TODO comment "Is this correct?" indicates the author's uncertainty about this change.

Fix in Cursor Fix in Web

)
and self._shared_experts is not None
)
Expand Down
Loading