Commit 7fcb48e

Add pynccl RS and AGv
1 parent 5c27d34 commit 7fcb48e

File tree: 6 files changed, +127 −89 lines changed

vllm/distributed/device_communicators/base_device_communicator.py

Lines changed: 10 additions & 2 deletions

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import threading
-from typing import Optional, Union, List
+from typing import List, Optional, Union
 from weakref import WeakValueDictionary

 import torch
@@ -144,9 +144,17 @@ def all_gatherv(self,
                     sizes: Optional[List[int]] = None):
         assert False, "not implemented"

+    def all_gatherv(self,
+                    input_: Union[torch.Tensor, List[torch.Tensor]],
+                    dim: int = 0,
+                    sizes: Optional[List[int]] = None):
+        assert False, "not implemented"
+
     def reduce_scatter(self,
                        input_: torch.Tensor,
-                       dim: int = -1) -> torch.Tensor:
+                       dim: int = -1,
+                       sizes: Optional[List[int]] = None) -> torch.Tensor:
+        assert sizes is None, "Varying size reduce scatter not supported with base device communicator"
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
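
For reference, the `sizes` argument threaded through these signatures means each rank owns a variable number of rows: every rank passes the full tensor with sum(sizes) rows, and rank r ends up with the cross-rank reduction of its sizes[r]-row slice. A minimal single-process sketch of that semantics (the helper and the sample values are illustrative, not part of the commit):

```python
import torch
from typing import List


def reduce_scatter_v_reference(inputs: List[torch.Tensor],
                               sizes: List[int]) -> List[torch.Tensor]:
    """inputs[r] is rank r's full (sum(sizes), ...) tensor; outputs[r] is what
    rank r would hold after the variable-size reduce-scatter: the sum over
    ranks of its sizes[r]-row slice."""
    assert all(t.shape[0] == sum(sizes) for t in inputs)
    total = torch.stack(inputs).sum(dim=0)  # element-wise sum across "ranks"
    return list(total.split(sizes, dim=0))  # rank r keeps sizes[r] rows


# Two "ranks", uneven split of 3 rows: rank 0 keeps 2 rows, rank 1 keeps 1.
a = torch.arange(6.0).reshape(3, 2)
b = torch.ones(3, 2)
out0, out1 = reduce_scatter_v_reference([a, b], sizes=[2, 1])
assert out0.shape == (2, 2) and out1.shape == (1, 2)
```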

vllm/distributed/device_communicators/cuda_communicator.py

Lines changed: 26 additions & 13 deletions

@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional, Union, List
+from typing import List, Optional, Union

 import torch
 from torch.distributed import ProcessGroup
@@ -99,7 +99,10 @@ def all_reduce(self, input_):
         torch.distributed.all_reduce(out, group=self.device_group)
         return out

-    def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
+    def reduce_scatter(self,
+                       input_: torch.Tensor,
+                       dim: int = -1,
+                       sizes: Optional[List[int]] = None):
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
         assert pynccl_comm is not None
@@ -111,15 +114,20 @@ def reduce_scatter(self, input_: torch.Tensor, dim: int = -1):
         # the input_tensor contiguous. Possible bug in reduce_scatter_tensor?
         input_tensor = input_.movedim(0, dim).contiguous()

-        assert input_tensor.shape[0] % world_size == 0
-        chunk_size = input_tensor.shape[0] // world_size
+        if sizes is not None:
+            assert len(sizes) == world_size
+            assert input_tensor.shape[0] == sum(sizes)
+            chunk_size = sizes[self.rank_in_group]
+        else:
+            assert input_tensor.shape[0] % world_size == 0
+            chunk_size = input_tensor.shape[0] // world_size
         output_shape = (chunk_size, ) + input_tensor.shape[1:]

         output = torch.empty(output_shape,
                              dtype=input_tensor.dtype,
                              device=input_tensor.device)

-        pynccl_comm.reduce_scatter(output, input_)
+        pynccl_comm.reduce_scatter(output, input_, sizes=sizes)

         # Reshape before returning
         return output.movedim(0, dim).contiguous()
@@ -170,28 +178,34 @@ def destroy(self):
     Use this:
     ... = get_dp_group().all_gatherv([topk_weights, topk_ids, a1q, a1q_scale], dim=0, sizes=get_forward_context().dp_metadata.num_tokens_across_dp_cpu)
     """
-    def all_gatherv(self, input_: Union[torch.Tensor, List[torch.Tensor]], dim: int = 0, sizes: Optional[List[int]] = None):
+
+    def all_gatherv(self,
+                    input_: Union[torch.Tensor, List[torch.Tensor]],
+                    dim: int = 0,
+                    sizes: Optional[List[int]] = None):
         assert dim == 0, "only dim 0 all-gather is supported"
         world_size = self.world_size
         pynccl_comm = self.pynccl_comm
         assert pynccl_comm is not None and not pynccl_comm.disabled

-        def _all_gather_single(input_: torch.Tensor, sizes: Optional[List[int]] = None):
+        def _all_gather_single(input_: torch.Tensor,
+                               sizes: Optional[List[int]] = None):
             input_size = input_.size()
             if sizes is not None:
                 assert len(sizes) == world_size
                 assert input_.shape[dim] == sizes[self.rank_in_group]
+                output_size = (sum(sizes), ) + input_size[1:]
                 # 'sizes' is not needed if all inputs in the same group have the same shape
                 if all(s == sizes[0] for s in sizes):
                     sizes = None
-                output_size = (sum(sizes),) + input_size[1:]
             else:
-                output_size = (input_size[0] * world_size,) + input_size[1:]
+                output_size = (input_size[0] * world_size, ) + input_size[1:]
             # Allocate output tensor.
-            output_tensor = torch.empty(
-                output_size, dtype=input_.dtype, device=input_.device
-            )
+            output_tensor = torch.empty(output_size,
+                                        dtype=input_.dtype,
+                                        device=input_.device)
             pynccl_comm.all_gather(output_tensor, input_, sizes=sizes)
+            return output_tensor

         if isinstance(input_, torch.Tensor):
             return _all_gather_single(input_, sizes)
@@ -201,7 +215,6 @@ def _all_gather_single(input_: torch.Tensor, sizes: Optional[List[int]] = None):
         for inp in input_:
             output_list.append(_all_gather_single(inp, sizes=sizes))
         pynccl_comm.group_end()
-
         return output_list

     def dispatch(
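
A hedged usage sketch matching the docstring above: gathering variable-length per-DP-rank rows for a fused-MoE style dispatch. Only the `all_gatherv` signature and the forward-context expression come from the diff; the import paths and the wrapper function are assumptions.

```python
import torch
from vllm.distributed.parallel_state import get_dp_group  # assumed path
from vllm.forward_context import get_forward_context      # assumed path


def gather_across_dp(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                     a1q: torch.Tensor, a1q_scale: torch.Tensor):
    # Per-rank token counts; each rank contributes sizes[rank] rows on dim 0.
    sizes = get_forward_context().dp_metadata.num_tokens_across_dp_cpu
    # One pynccl group batches the four gathers; each output has sum(sizes)
    # rows along dim 0.
    return get_dp_group().all_gatherv(
        [topk_weights, topk_ids, a1q, a1q_scale], dim=0, sizes=sizes)
```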

vllm/distributed/device_communicators/pynccl.py

Lines changed: 32 additions & 12 deletions

@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from typing import Optional, Union, List
+from typing import List, Optional, Union

+import numpy as np
 # ===================== import region =====================
 import torch
 import torch.distributed as dist
@@ -147,13 +148,14 @@ def all_gather(self,
                 f"but the input tensor is on {input_tensor.device}")
         if stream is None:
             stream = current_stream()
-        if sizes:
+        if sizes is not None:
             assert output_tensor.shape[0] == sum(sizes)
             numel_base = int(np.prod(output_tensor.shape[1:]))
             split_offset = 0
             self.nccl.ncclGroupStart()
             for root, split_size in enumerate(sizes):
-                dst_slice = output_tensor[split_offset:split_offset + split_size]
+                dst_slice = output_tensor[split_offset:split_offset +
+                                          split_size]
                 self.nccl.ncclBroadcast(
                     buffer_type(input_tensor.data_ptr()),
                     buffer_type(dst_slice.data_ptr()),
@@ -176,7 +178,8 @@ def reduce_scatter(self,
                        output_tensor: torch.Tensor,
                        input_tensor: torch.Tensor,
                        op: ReduceOp = ReduceOp.SUM,
-                       stream=None):
+                       stream=None,
+                       sizes: Optional[List[int]] = None):
         if self.disabled:
             return
         # nccl communicator created on a specific device
@@ -187,12 +190,29 @@ def reduce_scatter(self,
                 f"but the input tensor is on {input_tensor.device}")
         if stream is None:
             stream = current_stream()
-        self.nccl.ncclReduceScatter(
-            buffer_type(input_tensor.data_ptr()),
-            buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
-            ncclDataTypeEnum.from_torch(input_tensor.dtype),
-            ncclRedOpTypeEnum.from_torch(op), self.comm,
-            cudaStream_t(stream.cuda_stream))
+
+        if sizes is not None:
+            numel_base = int(np.prod(input_tensor.shape[1:]))
+            split_offset = 0
+            self.nccl.ncclGroupStart()
+            for root, split_size in enumerate(sizes):
+                chunk = input_tensor[split_offset:split_offset + split_size, :]
+                self.nccl.ncclReduce(
+                    buffer_type(chunk.data_ptr()),
+                    buffer_type(output_tensor.data_ptr()),
+                    split_size * numel_base,
+                    ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                    ncclRedOpTypeEnum.from_torch(op), root, self.comm,
+                    cudaStream_t(stream.cuda_stream))
+                split_offset += split_size
+            self.nccl.ncclGroupEnd()
+        else:
+            self.nccl.ncclReduceScatter(
+                buffer_type(input_tensor.data_ptr()),
+                buffer_type(output_tensor.data_ptr()), output_tensor.numel(),
+                ncclDataTypeEnum.from_torch(input_tensor.dtype),
+                ncclRedOpTypeEnum.from_torch(op), self.comm,
+                cudaStream_t(stream.cuda_stream))

     def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
@@ -236,9 +256,9 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
         self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(),
                                 ncclDataTypeEnum.from_torch(tensor.dtype), src,
                                 self.comm, cudaStream_t(stream.cuda_stream))
-
+
     def group_start(self):
         self.nccl.ncclGroupStart()

     def group_end(self):
-        self.nccl.ncclGroupEnd()
+        self.nccl.ncclGroupEnd()
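
The `sizes` branch above emulates a variable-size reduce-scatter with one ncclReduce per destination rank inside an ncclGroupStart/ncclGroupEnd pair: rank r is the root for the reduction of its own sizes[r]-row chunk. The same semantics written against plain torch.distributed rather than the raw NCCL bindings, as a correctness reference only (unbatched, default process group, helper name is ours, not vLLM's):

```python
import torch
import torch.distributed as dist
from typing import List


def reduce_scatter_v(input_: torch.Tensor, sizes: List[int]) -> torch.Tensor:
    """Every rank in the default process group passes the full
    (sum(sizes), ...) tensor; rank r gets back the cross-rank sum of rows
    [offset_r, offset_r + sizes[r])."""
    assert input_.shape[0] == sum(sizes)
    rank = dist.get_rank()
    output = None
    offset = 0
    for root, split_size in enumerate(sizes):
        # Clone so dist.reduce can overwrite the buffer on the root rank
        # without touching the caller's input.
        chunk = input_[offset:offset + split_size].clone()
        dist.reduce(chunk, dst=root, op=dist.ReduceOp.SUM)
        if root == rank:
            output = chunk  # the reduced result is only valid on the root
        offset += split_size
    return output
```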

vllm/distributed/device_communicators/pynccl_wrapper.py

Lines changed: 25 additions & 1 deletion

@@ -154,6 +154,16 @@ class NCCLLibrary:
             ncclRedOp_t, ncclComm_t, cudaStream_t
         ]),

+        # ncclResult_t ncclReduce(
+        #   const void* sendbuff, void* recvbuff, size_t count,
+        #   ncclDataType_t datatype, ncclRedOp_t op, int root,
+        #   ncclComm_t comm, cudaStream_t stream);
+        # note that cudaStream_t is a pointer type, so the last argument
+        # is a pointer
+        Function("ncclReduce", ncclResult_t, [
+            buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t,
+            ncclRedOp_t, ctypes.c_int, ncclComm_t, cudaStream_t
+        ]),
         # ncclResult_t ncclAllGather(
         #   const void* sendbuff, void* recvbuff, size_t count,
         #   ncclDataType_t datatype, ncclComm_t comm,
@@ -207,7 +217,7 @@ class NCCLLibrary:
         # it is better not to call it at all.
         # ncclResult_t ncclCommDestroy(ncclComm_t comm);
         Function("ncclCommDestroy", ncclResult_t, [ncclComm_t]),
-        # ncclResult_t ncclGroupStart();
+        # ncclResult_t ncclGroupStart();
         Function("ncclGroupStart", ncclResult_t, []),
         # ncclResult_t ncclGroupEnd();
         Function("ncclGroupEnd", ncclResult_t, []),
@@ -304,6 +314,18 @@ def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
                                                      datatype, op, comm,
                                                      stream))

+    def ncclReduce(self, sendbuff: buffer_type, recvbuff: buffer_type,
+                   count: int, datatype: int, op: int, root: int,
+                   comm: ncclComm_t, stream: cudaStream_t) -> None:
+        # `datatype` actually should be `ncclDataType_t`
+        # and `op` should be `ncclRedOp_t`
+        # both are aliases of `ctypes.c_int`
+        # when we pass int to a function, it will be converted to `ctypes.c_int`
+        # by ctypes automatically
+        self.NCCL_CHECK(self._funcs["ncclReduce"](sendbuff, recvbuff, count,
+                                                  datatype, op, root, comm,
+                                                  stream))
+
     def ncclReduceScatter(self, sendbuff: buffer_type, recvbuff: buffer_type,
                           count: int, datatype: int, op: int, comm: ncclComm_t,
                           stream: cudaStream_t) -> None:
@@ -348,9 +370,11 @@ def ncclCommDestroy(self, comm: ncclComm_t) -> None:

     def ncclGroupStart(self) -> None:
         self.NCCL_CHECK(self._funcs["ncclGroupStart"]())
+
     def ncclGroupEnd(self) -> None:
         self.NCCL_CHECK(self._funcs["ncclGroupEnd"]())

+
 __all__ = [
     "NCCLLibrary", "ncclDataTypeEnum", "ncclRedOpTypeEnum", "ncclUniqueId",
     "ncclComm_t", "cudaStream_t", "buffer_type"

vllm/distributed/parallel_state.py

Lines changed: 15 additions & 10 deletions

@@ -30,7 +30,7 @@
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
 from multiprocessing import shared_memory
-from typing import Any, Callable, Optional, Union, List
+from typing import Any, Callable, List, Optional, Union
 from unittest.mock import patch

 import torch
@@ -380,16 +380,17 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
     def _all_gather_out_place(self, input_: torch.Tensor,
                               dim: int) -> torch.Tensor:
         return self.device_communicator.all_gather(input_, dim)
-
-    def all_gatherv(self,
+
+    def all_gatherv(self,
                     input_: Union[torch.Tensor, List[torch.Tensor]],
                     dim: int = 0,
                     sizes: Optional[List[int]] = None):
         return self.device_communicator.all_gatherv(input_, dim, sizes)
-
+
     def reduce_scatter(self,
                        input_: torch.Tensor,
-                       dim: int = -1) -> torch.Tensor:
+                       dim: int = -1,
+                       sizes: Optional[List[int]] = None) -> torch.Tensor:
         world_size = self.world_size
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
@@ -398,16 +399,20 @@ def reduce_scatter(self,
             f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")

         if self.use_custom_op_call:
+            assert sizes is None, "Varying size reduce scatter not supported with vllm custom op"
             return torch.ops.vllm.reduce_scatter(input_,
                                                  dim,
                                                  world_size,
                                                  group_name=self.unique_name)
         else:
-            return self._reduce_scatter_out_place(input_, dim)
-
-    def _reduce_scatter_out_place(self, input_: torch.Tensor,
-                                  dim: int) -> torch.Tensor:
-        return self.device_communicator.reduce_scatter(input_, dim)
+            return self._reduce_scatter_out_place(input_, dim, sizes)
+
+    def _reduce_scatter_out_place(
+            self,
+            input_: torch.Tensor,
+            dim: int,
+            sizes: Optional[List[int]] = None) -> torch.Tensor:
+        return self.device_communicator.reduce_scatter(input_, dim, sizes)

     def gather(self,
                input_: torch.Tensor,
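
End-to-end, the GroupCoordinator now exposes both variable-size collectives. A hedged round-trip sketch of the API (tensor names, counts, and the helper are illustrative; the import path is an assumption; and per the assert above, `sizes` is only accepted on the non-custom-op path):

```python
import torch
from typing import List
from vllm.distributed.parallel_state import get_dp_group  # assumed path


def dp_round_trip(local_rows: torch.Tensor, sizes: List[int]) -> torch.Tensor:
    """local_rows has sizes[rank] rows on this rank."""
    group = get_dp_group()
    # Gather every rank's variable-length rows: result has sum(sizes) rows.
    full = group.all_gatherv(local_rows, dim=0, sizes=sizes)
    # Reduce-scatter back: this rank gets a (sizes[rank], ...) tensor holding
    # the sum over ranks of their copies of its slice.
    return group.reduce_scatter(full, dim=0, sizes=sizes)
```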
