@@ -24,7 +24,6 @@
 #include <tvm/runtime/registry.h>
 
 #include <cstring>
-#include <memory>
 #include <mutex>
 #include <sstream>
 #include <vector>
@@ -39,7 +38,6 @@
 #if TVM_NCCL_RCCL_SWITCH == 0
 #include <nccl.h>
 
-#include "../../../../3rdparty/trt-llm-allreduce/include/cuda_allreduce.h"
 #include "../../cuda/cuda_common.h"
 #else
 #include <rccl/rccl.h>
@@ -142,7 +140,6 @@ struct CCLThreadLocalContext {
   int device_id;
   deviceStream_t default_stream = nullptr;
   ncclComm_t comm;
-  std::unique_ptr<CustomAllReduce> custom_allreduce;
 
   void Clear() {
     NCCL_CALL(ncclCommDestroy(comm));
@@ -193,8 +190,6 @@ void InitCCLPerWorker(IntTuple device_ids, std::string unique_id_bytes) {
   worker->ccl = TVM_DISCO_CCL_NAME;
   ctx->worker = worker;
   ctx->device_id = device_id;
-  ctx->custom_allreduce =
-      std::make_unique<CustomAllReduce>(worker->num_workers, worker->worker_id, ctx->comm);
   // Initialize the communicator
   ncclUniqueId id;
   std::memcpy(id.internal, unique_id_bytes.data(), NCCL_UNIQUE_ID_BYTES);
@@ -206,13 +201,6 @@ void AllReduce(NDArray send, ReduceKind reduce_kind, NDArray recv) {
   ShapeTuple shape = send.Shape();
   int64_t numel = shape->Product();
   deviceStream_t stream = ctx->GetDefaultStream();
-  // TODO(csullivan) make this work
-  // 1. pass type in
-  // 2. src and dest args
-  // 3. some strategy selection outside, if (!enqueue) do nccl?
-  // 4. reduce kind
-  // 5. pass stream in to custom api
-  // ctx->custom_allreduce->enqueue(send->data, numel);
   NCCL_CALL(ncclAllReduce(send->data, recv->data, numel,
                           /*datatype=*/AsNCCLDataType(DataType(send->dtype)),
                           /*op=*/AsNCCLRedOp(reduce_kind), ctx->comm, stream));
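
For context, the removed TODO sketched a dispatch strategy: enqueue on the custom all-reduce kernel when possible, otherwise fall back to ncclAllReduce. Below is a minimal sketch of that idea, assuming a hypothetical CustomAllReduce::Enqueue that takes source/destination buffers, dtype, reduce kind, and stream, and returns false when it declines the request; the actual trt-llm-allreduce interface may differ.

void AllReduceWithFallback(NDArray send, ReduceKind reduce_kind, NDArray recv,
                           CCLThreadLocalContext* ctx) {
  int64_t numel = send.Shape()->Product();
  deviceStream_t stream = ctx->GetDefaultStream();
  // Assumed interface: Enqueue() returns false when the custom kernel cannot
  // handle this size/dtype/reduce kind, leaving the buffers untouched.
  bool handled = ctx->custom_allreduce != nullptr &&
                 ctx->custom_allreduce->Enqueue(send->data, recv->data, numel,
                                                DataType(send->dtype), reduce_kind, stream);
  if (!handled) {
    // Fall back to the plain NCCL path that this commit keeps.
    NCCL_CALL(ncclAllReduce(send->data, recv->data, numel,
                            /*datatype=*/AsNCCLDataType(DataType(send->dtype)),
                            /*op=*/AsNCCLRedOp(reduce_kind), ctx->comm, stream));
  }
}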