@@ -166,10 +166,6 @@ class AllreduceOp
         size_t bytes_per_element = input.element_size();
         TLLM_LOG_DEBUG("All reduce message size is %zu", size * bytes_per_element);

-        if (std::getenv("TLLM_USE_NCCL_UB") && mStrategy == AllReduceStrategyType::UB)
-        {
-            return runNCCLAllReduceUB(input, residual, norm_weight, scale, bias);
-        }
         AllReduceStrategyType runtime_strategy = getRuntimeStrategy(seq_len, size);

         // Log runtime strategy
@@ -181,6 +177,8 @@ class AllreduceOp
         {
         case AllReduceStrategyType::UB: return runUBAllReduce(input, residual, norm_weight, scale, bias);
         case AllReduceStrategyType::NCCL: return runNCCLAllReduce(input, residual, norm_weight, scale, bias);
+        case AllReduceStrategyType::NCCL_SYMMETRIC:
+            return runNCCLAllReduceSymmetric(input, residual, norm_weight, scale, bias);
         case AllReduceStrategyType::MIN_LATENCY:
         case AllReduceStrategyType::ONESHOT:
         case AllReduceStrategyType::TWOSHOT:
@@ -307,7 +305,7 @@ class AllreduceOp
         return fallbackRunSubsequentOps(input, residual, norm_weight, scale, bias, reduce_output);
     }

-    std::vector<torch::Tensor> runNCCLAllReduceUB(torch::Tensor const& input,
+    std::vector<torch::Tensor> runNCCLAllReduceSymmetric(torch::Tensor const& input,
         torch::optional<torch::Tensor> const& residual, torch::optional<torch::Tensor> const& norm_weight,
         torch::optional<torch::Tensor> const& scale, torch::optional<torch::Tensor> const& bias) noexcept
     {
@@ -316,11 +314,20 @@ class AllreduceOp
         int size = input.numel();
         auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance();
         auto ub_buffer0 = ub_manager.search_buffer(input.data_ptr());
+        if (ub_buffer0.invalid())
+        {
+            auto [symmetric_input, symmetric_ub_buffer0]
+                = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());
+            cudaMemcpyAsync(symmetric_ub_buffer0.addr, input.data_ptr(), size * input.element_size(),
+                cudaMemcpyDeviceToDevice, stream);
+            ub_buffer0 = symmetric_ub_buffer0;
+        }
+
         TLLM_CHECK(!ub_buffer0.invalid());
         auto [norm_out, ub_buffer1] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type());

         NCCLCHECK(ncclAllReduce(
-            input.data_ptr(), norm_out.mutable_data_ptr(), size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream));
+            ub_buffer0.addr, norm_out.mutable_data_ptr(), size, (*getDtypeMap())[mType], ncclSum, *mNcclComm, stream));

         if (mOp == AllReduceFusionOp::NONE)
         {
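Note on the hunk above: when search_buffer() does not find a registered user buffer for the input pointer, the new branch creates a symmetric user-buffer tensor of the same shape and dtype, stages the data into it with a device-to-device copy on the same stream, and then feeds that registered buffer to ncclAllReduce(). Below is a minimal standalone sketch of the same staging pattern in plain CUDA + NCCL, outside of TensorRT-LLM's UserBuffersManager; the function and parameter names are illustrative, not taken from the codebase.

#include <cuda_runtime.h>
#include <nccl.h>

// Illustrative only: sum-reduce `count` floats across ranks. If `input` is not a
// communication-registered buffer, stage it into the pre-registered `staged`
// buffer first so the collective always reads from registered memory.
void allreduceWithStaging(float const* input, float* staged, float* output, size_t count,
    bool inputIsRegistered, ncclComm_t comm, cudaStream_t stream)
{
    float const* sendPtr = input;
    if (!inputIsRegistered)
    {
        // Device-to-device copy on the same stream keeps it ordered before the collective.
        cudaMemcpyAsync(staged, input, count * sizeof(float), cudaMemcpyDeviceToDevice, stream);
        sendPtr = staged;
    }
    // Every rank in the communicator must call this collectively.
    ncclAllReduce(sendPtr, output, count, ncclFloat, ncclSum, comm, stream);
}

One follow-up observation: the return value of cudaMemcpyAsync() in the added branch is unchecked; wrapping it with the codebase's CUDA error-checking macro (TLLM_CUDA_CHECK, if that is what surrounding code uses) would surface copy failures before the collective launches.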
@@ -661,6 +668,10 @@ class AllreduceOp
         {
             runtime_strategy = AllReduceStrategyType::NCCL;
         }
+        else if (mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
+        {
+            runtime_strategy = AllReduceStrategyType::NCCL_SYMMETRIC;
+        }
         else
         {
             // This is for DEBUG and BENCHMARK purposes. It will override the strategy if AUTO is set.
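Note on the hunk above: an explicitly requested NCCL or NCCL_SYMMETRIC strategy now short-circuits the heuristic selection and is passed through unchanged as the runtime strategy. A compressed sketch of the same logic, assuming the two branches stay free of strategy-specific side effects (illustrative, not the committed code):

// Honor an explicit user choice of NCCL or NCCL_SYMMETRIC; anything else falls
// through to the AUTO / heuristic handling below.
if (mStrategy == AllReduceStrategyType::NCCL || mStrategy == AllReduceStrategyType::NCCL_SYMMETRIC)
{
    runtime_strategy = mStrategy;
}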
@@ -686,6 +697,11 @@ class AllreduceOp
             TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL", rank);
             break;
         }
+        case AllReduceStrategyType::NCCL_SYMMETRIC:
+        {
+            TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: NCCL_SYMMETRIC", rank);
+            break;
+        }
         case AllReduceStrategyType::MIN_LATENCY:
         {
             TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: MIN_LATENCY", rank);
@@ -701,7 +717,7 @@ class AllreduceOp
             TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: LOWPRECISION", rank);
             break;
         }
-        default: break;
+        default: TLLM_LOG_DEBUG("AllReducePlugin strategy for rank %d: UNKNOWN: %d", rank, strategy); break;
         }
     }
