Commit 3f77fe7

Add UB NCCL integration
Signed-off-by: Iman Tabrizian <[email protected]>
1 parent 174c518 commit 3f77fe7

File tree: 8 files changed (+171, -32 lines)


cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp

Lines changed: 60 additions & 21 deletions
@@ -14,61 +14,100 @@
  * limitations under the License.
  */
 #include "ub_allocator.h"
+#include "tensorrt_llm/common/opUtils.h"
+#include <set>

 namespace tensorrt_llm::runtime::ub
 {
 UserBufferAllocator& UserBufferAllocator::Instance()
 {
-    static UserBufferAllocator _;
-    return _;
+    // if environment variable TLLM_USE_NCCL_UB is set to 1, use NCCLUserBufferAllocator
+    char* useNCCLUB = std::getenv("TLLM_USE_NCCL_UB");
+    if (useNCCLUB != nullptr)
+    {
+        static NCCLUserBufferAllocator _;
+        return _;
+    }
+    else
+    {
+        static UserBufferAllocator _;
+        return _;
+    }
 }

-void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& world_config)
+void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig)
 {
-    if (!is_initialized())
+    if (!isInitialized())
     {
-        ub_comm_ = nullptr;
-        world_config_ = world_config;
-        create_communicator_grouped2(&ub_comm_, world_config_);
-        TLLM_CHECK(ub_comm_ != nullptr);
-        is_initialized_ = true;
+        mUbComm = nullptr;
+        mWorldConfig = worldConfig;
+        create_communicator_grouped2(&mUbComm, worldConfig);
+        TLLM_CHECK(mUbComm != nullptr);
+        mIsInitialized = true;
     }
 }

-bool UserBufferAllocator::is_initialized()
+bool UserBufferAllocator::isInitialized()
 {
-    return is_initialized_;
+    return mIsInitialized;
 }

-UBBuffer UserBufferAllocator::register_ub_buffer(size_t bytes)
+UBBuffer UserBufferAllocator::registerUBBuffer(size_t bytes)
 {
-    TLLM_CHECK(is_initialized());
+    TLLM_CHECK(isInitialized());
     void* addr = nullptr;
     int handle = -1;
-    handle = register_user_buffer_collective((void**) &addr, bytes, ub_comm_);
+    handle = register_user_buffer_collective((void**) &addr, bytes, mUbComm);
     return {addr, handle, bytes};
 }

 UBBuffer UserBufferAllocator::allocate(size_t bytes)
 {
-    TLLM_CHECK(is_initialized());
-    auto ub_buffer = register_ub_buffer(bytes);
+    TLLM_CHECK(isInitialized());
+    auto ub_buffer = registerUBBuffer(bytes);
     TLLM_CHECK(!ub_buffer.invalid());
-    buffers_.push_back(ub_buffer);
+    mBuffers.push_back(ub_buffer);
     return ub_buffer;
 }

 void UserBufferAllocator::deallocate(void* addr) {}

 UBBuffer UserBufferAllocator::get(int idx)
 {
-    TLLM_CHECK(is_initialized() && idx < buffers_.size() && !buffers_[idx].invalid());
-    return buffers_[idx];
+    TLLM_CHECK(isInitialized() && idx < mBuffers.size() && !mBuffers[idx].invalid());
+    return mBuffers[idx];
 }

 communicator* UserBufferAllocator::comm()
 {
-    TLLM_CHECK(is_initialized());
-    return ub_comm_;
+    TLLM_CHECK(isInitialized());
+    return mUbComm;
+}
+
+void NCCLUserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig)
+{
+    if (!isInitialized())
+    {
+        TLLM_LOG_INFO("Initializing NCCLUserBufferAllocator");
+        std::set<int> group;
+        for (int i = 0; i < worldConfig.getSize(); i++)
+        {
+            group.insert(i);
+        }
+        mComm = getComm(group);
+        mIsInitialized = true;
+    }
 }
+
+UBBuffer NCCLUserBufferAllocator::registerUBBuffer(size_t bytes)
+{
+    TLLM_CHECK(isInitialized());
+    UBBuffer ub_buffer;
+    NCCLCHECK(ncclMemAlloc(&ub_buffer.addr, bytes));
+    NCCLCHECK(ncclCommWindowRegister((*mComm), ub_buffer.addr, bytes, &ub_buffer.window, NCCL_WIN_COLL_SYMMETRIC));
+    ub_buffer.handle = 5;
+    ub_buffer.size = bytes;
+    return ub_buffer;
+}
+
 }; // namespace tensorrt_llm::runtime::ub
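
Note on the NCCL path above: the getenv check only tests that TLLM_USE_NCCL_UB is set (any value selects NCCLUserBufferAllocator), and registerUBBuffer backs each user buffer with NCCL symmetric memory. For readers unfamiliar with that API, the sketch below shows the same registration pattern in isolation. It is illustrative only, not code from this commit: it assumes NCCL >= 2.27 (ncclMemAlloc, ncclCommWindowRegister, NCCL_WIN_COLL_SYMMETRIC) and an already-initialized ncclComm_t, and the helper name and NCCL_CHECK macro are made up for the example.

// Sketch only: allocate a buffer with ncclMemAlloc and register it as a symmetric
// window on an existing communicator, mirroring what the NCCL-backed allocator does.
#include <nccl.h>
#include <cstdio>
#include <cstdlib>

#define NCCL_CHECK(cmd)                                                                                \
    do                                                                                                 \
    {                                                                                                  \
        ncclResult_t r = (cmd);                                                                        \
        if (r != ncclSuccess)                                                                          \
        {                                                                                              \
            std::fprintf(stderr, "NCCL error %s at %s:%d\n", ncclGetErrorString(r), __FILE__, __LINE__); \
            std::exit(1);                                                                              \
        }                                                                                              \
    } while (0)

// `comm` is assumed to be created elsewhere (e.g. via ncclCommInitRank).
void* allocateSymmetricBuffer(ncclComm_t comm, size_t bytes, ncclWindow_t* window)
{
    void* addr = nullptr;
    NCCL_CHECK(ncclMemAlloc(&addr, bytes)); // NCCL-owned device allocation
    NCCL_CHECK(ncclCommWindowRegister(comm, addr, bytes, window, NCCL_WIN_COLL_SYMMETRIC));
    return addr;                            // now usable by collectives on `comm`
}

Symmetric window registration is a collective operation, so every rank must register buffers of the same size in the same order; funneling all registrations through the shared allocator singleton keeps that ordering consistent.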

cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h

Lines changed: 22 additions & 8 deletions
@@ -28,11 +28,13 @@ struct UBBuffer
     void* addr;
     int handle;
     size_t size;
+    ncclWindow_t window;

-    UBBuffer(void* a = nullptr, int h = -1, size_t s = 0)
+    UBBuffer(void* a = nullptr, int h = -1, size_t s = 0, ncclWindow_t w = nullptr)
         : addr(a)
         , handle(h)
         , size(s)
+        , window(w)
     {
     }

@@ -49,21 +51,33 @@ class UserBufferAllocator

     UserBufferAllocator() = default;

-    void initialize(tensorrt_llm::runtime::WorldConfig const& world_config);
-    bool is_initialized();
+    virtual void initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig);
+    bool isInitialized();
     UBBuffer allocate(size_t bytes);
     void deallocate(void* addr);
     UBBuffer get(int idx);
     communicator* comm();
+    virtual UBBuffer registerUBBuffer(size_t bytes);

 private:
-    UBBuffer register_ub_buffer(size_t bytes);
+    communicator* mUbComm;

-    communicator* ub_comm_;
-    std::vector<UBBuffer> buffers_;
-    bool is_initialized_;
-    tensorrt_llm::runtime::WorldConfig world_config_;
+protected:
+    std::vector<UBBuffer> mBuffers;
+    bool mIsInitialized;
+    tensorrt_llm::runtime::WorldConfig mWorldConfig;
 };
+
+class NCCLUserBufferAllocator : public UserBufferAllocator
+{
+public:
+    void initialize(tensorrt_llm::runtime::WorldConfig const& world_config) override;
+    UBBuffer registerUBBuffer(size_t bytes) override;
+
+private:
+    std::shared_ptr<ncclComm_t> mComm;
+};
+
 #else
 using communicator = void;
 #endif
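
The new ncclWindow_t member on UBBuffer keeps the handle returned by ncclCommWindowRegister. This commit does not add a teardown path for NCCL-backed buffers, but a counterpart would look roughly like the sketch below (illustrative only; the function name is hypothetical and NCCL_CHECK is the macro from the previous sketch):

// Sketch: release an NCCL-backed user buffer by deregistering its symmetric
// window first, then freeing the ncclMemAlloc allocation.
void releaseSymmetricBuffer(ncclComm_t comm, void* addr, ncclWindow_t window)
{
    NCCL_CHECK(ncclCommWindowDeregister(comm, window)); // drop the window registration
    NCCL_CHECK(ncclMemFree(addr));                      // free the NCCL allocation
}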

cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ void ub_initialize(int tp_size)

 bool ub_is_initialized()
 {
-    return UserBufferAllocator::Instance().is_initialized();
+    return UserBufferAllocator::Instance().isInitialized();
 }

 UBBuffer ub_allocate(size_t bytes)

cpp/tensorrt_llm/thop/allreduceOp.cpp

Lines changed: 28 additions & 2 deletions
@@ -161,8 +161,10 @@ class AllreduceOp
         size_t size = input.numel();
         size_t seq_len = input.size(0);

-        // If strategy is set to UB, UB must be used as UB impl output is special and cannot be used
-        // by others.
+        if (std::getenv("TLLM_USE_NCCL_UB") && mStrategy == AllReduceStrategyType::UB)
+        {
+            return runNCCLAllReduceUB(input, residual, norm_weight, scale, bias);
+        }
         AllReduceStrategyType runtime_strategy = getRuntimeStrategy(seq_len, size);

         // Log runtime strategy
@@ -299,6 +301,30 @@ class AllreduceOp
         return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, reduce_output);
     }

+    std::vector<torch::Tensor> runNCCLAllReduceUB(torch::Tensor const& input,
+        torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
+        torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias) noexcept
+    {
+
+        auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+        int size = input.numel();
+        auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance();
+        auto ub_buffer0 = ub_manager.search_buffer(input.data_ptr());
+        TLLM_CHECK(!ub_buffer0.invalid());
+        auto [norm_out, ub_buffer1] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());
+
+        NCCLCHECK(ncclAllReduce(
+            input.data_ptr(), norm_out.mutable_data_ptr(), size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream));
+
+        if (mOp == AllReduceFusionOp::NONE)
+        {
+            return {norm_out};
+        }
+
+        // Treat any other patterns as fallback cases.
+        return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, norm_out);
+    }
+
     std::vector<torch::Tensor> runLowPrecisionAllReduce(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
         torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias) noexcept
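
runNCCLAllReduceUB above looks up the input's user buffer, allocates the output tensor from the same user-buffer pool, and issues a single ncclAllReduce on the current CUDA stream; any fusion pattern other than NONE falls through to the existing fallbackRunSubsequentOps. Stripped of the TensorRT-LLM plumbing, the collective reduces to the call sketched below (illustrative only: an fp16 example with an assumed communicator and stream, a count in elements rather than bytes, and the NCCL_CHECK macro from the earlier sketch):

// Sketch: sum-all-reduce `count` fp16 elements from sendBuf into recvBuf on `stream`.
void allReduceHalf(void const* sendBuf, void* recvBuf, size_t count, ncclComm_t comm, cudaStream_t stream)
{
    NCCL_CHECK(ncclAllReduce(sendBuf, recvBuf, count, ncclHalf, ncclSum, comm, stream));
}

When both buffers live in windows registered with NCCL_WIN_COLL_SYMMETRIC, NCCL can serve the call with its symmetric-memory kernels, which is the point of routing allocations through the NCCL-backed allocator.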

tensorrt_llm/_torch/compilation/backend.py

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ def get_custom_pass(cls, enable_userbuffers):
         register_ar_residual_norm(cls._custom_pass_instances[0])
         if enable_userbuffers and tensorrt_llm.bindings.internal.userbuffers.ub_supported(
         ):
+            print("Registering UB patterns", flush=True)
             register_ub_patterns(cls._custom_pass_instances)
         else:
             register_add_norm(cls._custom_pass_instances[0])

tensorrt_llm/_torch/compilation/patterns/ub_allreduce.py

Lines changed: 53 additions & 0 deletions
@@ -180,8 +180,61 @@ def extra_check_fp4_quant_pattern(match: Match) -> bool:
             extra_check=extra_check_fp4_quant_pattern,
         )

+    def register_no_quant_pattern(custom_pass: PatternMatcherPass):
+        input_node = KeywordArg('input')
+        fusion = KeywordArg('fusion_op')
+        trtllm_allreduce_default = CallFunction(
+            torch.ops.trtllm.allreduce.default, input_node,
+            KeywordArg('residual_in'), KeywordArg('gamma'), Ignored(),
+            Ignored(), Ignored(), mapping.tp_group, strategy, fusion,
+            KeywordArg('eps'))
+        no_quant_pattern = MultiOutputPattern([trtllm_allreduce_default])
+
+        def empty_no_quant_pattern(
+            input: torch.Tensor,
+            residual_in: torch.Tensor,
+            gamma: torch.Tensor,
+            eps: float,
+        ):
+            return
+
+        def target_no_quant_pattern(
+            input: torch.Tensor,
+            residual_in: torch.Tensor,
+            gamma: torch.Tensor,
+            eps: float,
+        ):
+            input = torch.ops.trtllm.copy_to_userbuffers(input)
+            all_reduce_output = torch.ops.trtllm.allreduce(
+                input, residual_in, gamma, None, None, None,
+                mapping.tp_group, int(AllReduceStrategy.UB), fusion, eps)
+            finalize_output = torch.ops.trtllm.userbuffers_allreduce_finalize(
+                all_reduce_output[-1], False)
+            return all_reduce_output[0], finalize_output
+
+        def extra_check_no_quant_pattern(match: Match) -> bool:
+            input = match.ctx.pattern_to_node[input_node]
+            if not isinstance(input, torch.fx.graph.Node):
+                return False
+            dtype = input.meta["tensor_meta"].dtype
+            # UB only supports FP16/BF16 input
+            if dtype != torch.float16 and dtype != torch.bfloat16:
+                return False
+            return True
+
+        register_replacement(
+            empty_no_quant_pattern,
+            target_no_quant_pattern,
+            [],
+            fwd_only,
+            custom_pass,
+            search_fn_pattern=no_quant_pattern,
+            extra_check=extra_check_no_quant_pattern,
+        )
+
     register_fp8_quant_pattern(custom_pass)
     register_fp4_quant_pattern(custom_pass)
+    # register_no_quant_pattern(custom_pass)

     def register_convert_supported_ar_to_ub(custom_pass: PatternMatcherPass):
         strategy = int(AllReduceStrategy.AUTO)

tensorrt_llm/_torch/distributed/ops.py

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,4 @@
+import logging
 import math
 import os
 import threading
@@ -11,6 +12,7 @@
 from tensorrt_llm.plugin.plugin import CustomAllReduceHelper

 _thread_local = threading.local()
+logger = logging.getLogger(__name__)


 def get_allreduce_workspace(mapping: Mapping) -> torch.LongTensor:

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 4 additions & 0 deletions
@@ -349,12 +349,16 @@ def __init__(
                 self.model.vocab_size_padded)

         self._torch_compile_backend = None
+        print(
+            f"torch_compile_enabled: {pytorch_backend_config.torch_compile_enabled}",
+            flush=True)

         try:
             if pytorch_backend_config.torch_compile_enabled:
                 set_torch_compiling(True)
                 use_ub = pytorch_backend_config.torch_compile_enable_userbuffers and self._init_userbuffers(
                     self.model.config.hidden_size)
+                print(f"use_ub: {use_ub}", flush=True)
                 self._torch_compile_backend = Backend(
                     pytorch_backend_config.torch_compile_inductor_enabled,
                     enable_userbuffers=use_ub,
