
Commit 9af99ec

Fix binding errors.
Signed-off-by: Shiyu Li <[email protected]>
1 parent 458f6d4 commit 9af99ec

File tree

5 files changed: +28, -27 lines changed


cpp/tensorrt_llm/nanobind/runtime/bindings.cpp

Lines changed: 3 additions & 2 deletions

@@ -359,8 +359,9 @@ void initBindings(nb::module_& m)
         nb::call_guard<nb::gil_scoped_release>());

     nb::class_<tensorrt_llm::runtime::McastGPUBuffer>(m, "McastGPUBuffer")
-        .def(nb::init<size_t, uint32_t, uint32_t, uint32_t, at::Device, bool>(),
-            nb::call_guard<nb::gil_scoped_release>())
+        .def(nb::init<size_t, uint32_t, uint32_t, uint32_t, uint32_t, bool>(), nb::arg("buf_size"),
+            nb::arg("group_size"), nb::arg("group_rank"), nb::arg("split_color"), nb::arg("device_idx"),
+            nb::arg("mn_nvlink"), nb::call_guard<nb::gil_scoped_release>())
         .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer,
             nb::call_guard<nb::gil_scoped_release>())
         .def("get_mc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getMCBuffer,

cpp/tensorrt_llm/pybind/runtime/bindings.cpp

Lines changed: 6 additions & 6 deletions

@@ -395,9 +395,8 @@ void initBindings(pybind11::module_& m)
         .def("finalize", &tr::GptDecoderBatched::finalize, py::arg("decoder_state"), py::arg("batch_idx"),
             py::arg("sampling_config"), py::arg("streaming"), py::call_guard<py::gil_scoped_release>())
         .def_property_readonly(
-            "decoder_stream",
-            [](tr::GptDecoderBatched& self) -> tr::CudaStream const& { return *self.getDecoderStream(); },
-            py::return_value_policy::reference);
+            "decoder_stream", [](tr::GptDecoderBatched& self) -> tr::CudaStream const&
+            { return *self.getDecoderStream(); }, py::return_value_policy::reference);

     m.def(
         "lamport_initialize_all",
@@ -408,8 +407,7 @@ void initBindings(pybind11::module_& m)
         },
         "Lamport initialize all buffers", py::call_guard<py::gil_scoped_release>());
     m.def(
-        "lamport_initialize",
-        [](intptr_t buffer, size_t size)
+        "lamport_initialize", [](intptr_t buffer, size_t size)
         { tensorrt_llm::kernels::ar_fusion::lamport_initialize(reinterpret_cast<void*>(buffer), size, 0); },
         "Lmaport initialize buffer", py::call_guard<py::gil_scoped_release>());
     m.def(
@@ -455,7 +453,9 @@ void initBindings(pybind11::module_& m)
         py::call_guard<py::gil_scoped_release>());

     py::class_<tensorrt_llm::runtime::McastGPUBuffer>(m, "McastGPUBuffer")
-        .def(py::init<size_t, uint32_t, uint32_t, uint32_t, at::Device, bool>(), py::call_guard<py::gil_scoped_release>())
+        .def(py::init<size_t, uint32_t, uint32_t, uint32_t, uint32_t, bool>(), py::arg("buf_size"),
+            py::arg("group_size"), py::arg("group_rank"), py::arg("split_color"), py::arg("device_idx"),
+            py::arg("mn_nvlink"), py::call_guard<py::gil_scoped_release>())
         .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer,
             py::call_guard<py::gil_scoped_release>())
         .def("get_mc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getMCBuffer,

cpp/tensorrt_llm/runtime/mcastGPUBuffer.h

Lines changed: 13 additions & 7 deletions

@@ -38,10 +38,10 @@ class McastGPUBuffer
     //! \param device The CUDA device for buffer allocation.
    //! \param mnNvlink Flag indicating if multi-node NVLink is used.
    McastGPUBuffer(
-        size_t bufSize, uint32_t groupSize, uint32_t groupRank, uint32_t splitColor, at::Device device, bool mnNvlink)
-        : mMcastDeviceMemory(bufSize, groupSize, groupRank, splitColor, device.index(), mnNvlink)
+        size_t bufSize, uint32_t groupSize, uint32_t groupRank, uint32_t splitColor, uint32_t deviceIdx, bool mnNvlink)
+        : mMcastDeviceMemory(bufSize, groupSize, groupRank, splitColor, deviceIdx, mnNvlink)
         , mBufSize(bufSize)
-        , mLocalDevice(device)
+        , mLocalDevice(at::Device(at::DeviceType::CUDA, deviceIdx))
     {
     }

@@ -51,7 +51,7 @@ class McastGPUBuffer
     //! \param dtype The data type of the tensor elements.
    //! \param storageOffset The offset in elements from the start of the buffer.
    //! \return An ATen tensor wrapping the unicast buffer section.
-    at::Tensor getUCBuffer(uint32_t rank, c10::IntArrayRef sizes, c10::ScalarType dtype, int64_t storageOffset)
+    at::Tensor getUCBuffer(uint32_t rank, std::vector<long int> sizes, torch::ScalarType dtype, int64_t storageOffset)
     {
         size_t const numel = std::accumulate(sizes.begin(), sizes.end(), 1UL, std::multiplies<size_t>());
         size_t const elementSize = c10::elementSize(dtype);
@@ -61,15 +61,18 @@ class McastGPUBuffer
         auto* dataPtr = static_cast<uint8_t*>(mMcastDeviceMemory.getUnicastPtr(rank)) + storageOffset * elementSize;

         auto options = at::TensorOptions().dtype(dtype).device(mLocalDevice);
-        return at::for_blob(dataPtr, sizes).options(options).target_device(mLocalDevice).make_tensor();
+        return at::for_blob(dataPtr, c10::IntArrayRef(sizes))
+            .options(options)
+            .target_device(mLocalDevice)
+            .make_tensor();
     }

     //! \brief Returns a PyTorch tensor view of the multicast buffer portion.
    //! \param sizes The desired shape (dimensions) of the tensor.
    //! \param dtype The data type of the tensor elements.
    //! \param storageOffset The offset in elements from the start of the buffer.
    //! \return An ATen tensor wrapping the multicast buffer section.
-    at::Tensor getMCBuffer(c10::IntArrayRef sizes, c10::ScalarType dtype, int64_t storageOffset)
+    at::Tensor getMCBuffer(std::vector<long int> sizes, torch::ScalarType dtype, int64_t storageOffset)
     {
         size_t const numel = std::accumulate(sizes.begin(), sizes.end(), 1UL, std::multiplies<size_t>());
         size_t const elementSize = c10::elementSize(dtype);
@@ -79,7 +82,10 @@ class McastGPUBuffer
         auto* dataPtr = static_cast<uint8_t*>(mMcastDeviceMemory.getMulticastPtr()) + storageOffset * elementSize;

         auto options = at::TensorOptions().dtype(dtype).device(mLocalDevice);
-        return at::for_blob(dataPtr, sizes).options(options).target_device(mLocalDevice).make_tensor();
+        return at::for_blob(dataPtr, c10::IntArrayRef(sizes))
+            .options(options)
+            .target_device(mLocalDevice)
+            .make_tensor();
     }

 private:
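
The sizes parameter switches from c10::IntArrayRef, a non-owning view that the binding layers likely cannot convert by default, to an owning std::vector that maps directly from a Python list; internally it is wrapped back into an IntArrayRef for at::for_blob. A hedged sketch of the resulting Python calls, reusing the hypothetical buf object from the earlier sketch (shapes, dtype, and offsets are illustrative):

import torch

# View rank 0's unicast slice as a 1024x1024 fp16 tensor starting at element offset 0.
uc = buf.get_uc_buffer(0, [1024, 1024], torch.float16, 0)

# View the multicast region with the same shape, dtype, and offset.
mc = buf.get_mc_buffer([1024, 1024], torch.float16, 0)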

tensorrt_llm/_torch/distributed/ops.py

Lines changed: 5 additions & 11 deletions

@@ -1,4 +1,3 @@
-import logging
 import math
 import os
 import platform
@@ -17,7 +16,6 @@
 from tensorrt_llm.plugin.plugin import CustomAllReduceHelper

 _thread_local = threading.local()
-logger = logging.getLogger(__name__)


 def get_allreduce_workspace(mapping: Mapping) -> torch.LongTensor:
@@ -61,8 +59,9 @@ def get_allreduce_mnnvl_workspace(
         setattr(_thread_local, f'allreduce_mnnvl_workspaces_{mapping.pp_rank}',
                 {})
     # Support topology split
-    comm = mpi_comm().Split(mapping.pp_rank * mapping.cp_size + mapping.cp_rank,
-                            mapping.tp_rank)
+    comm = mpi_comm().Split(
+        int(mapping.pp_rank * mapping.cp_size + mapping.cp_rank),
+        mapping.tp_rank)
     force_mn = os.environ.get("TRTLLM_FORCE_MNNVL_AR", "0") == "1"

     allreduce_mnnvl_workspaces = getattr(
@@ -82,7 +81,7 @@ def get_allreduce_mnnvl_workspace(
             mapping.tp_rank,
             # Split the communicator according to the topology
             mapping.pp_rank * mapping.cp_size + mapping.cp_rank,
-            torch.device("cuda", mapping.local_rank),
+            mapping.local_rank,
             True,  # mnNvlink
         )

@@ -463,12 +462,7 @@ def __init__(self,
         # Initialize MNNVL AllReduce if needed
         if self.strategy in (AllReduceStrategy.AUTO,
                              AllReduceStrategy.MNNVL):
-            if self.mapping.tp_size != self.mapping.world_size:
-                logger.debug(
-                    f"MNNVLAllReduce is disabled due to tp_size:{self.mapping.tp_size} "
-                    f"!= world_size:{self.mapping.world_size}")
-                self.mnnvl_allreduce = None
-            elif MNNVLAllReduce.is_mnnvl(self.mapping, dtype):
+            if MNNVLAllReduce.is_mnnvl(self.mapping, dtype):
                 try:
                     self.mnnvl_allreduce = MNNVLAllReduce(
                         self.mapping, dtype) if dtype else None
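
The Split color is now wrapped in int(), presumably so a built-in Python integer reaches mpi4py; each (pp_rank, cp_rank) pair gets its own color, so only ranks that differ in tp_rank share a communicator. A small worked example of the color/key computation (rank counts are illustrative, not from the commit):

# Illustrative only: shows how the Split color groups ranks into TP communicators.
pp_size, cp_size, tp_size = 2, 2, 2
for pp_rank in range(pp_size):
    for cp_rank in range(cp_size):
        color = int(pp_rank * cp_size + cp_rank)  # same color => same communicator
        for tp_rank in range(tp_size):
            # key=tp_rank orders ranks within each TP communicator
            print(f"pp={pp_rank} cp={cp_rank} tp={tp_rank} -> color={color}, key={tp_rank}")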

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 1 addition & 1 deletion

@@ -775,7 +775,7 @@ def _compute_mlp_tp_size(self, intermediate_size: int,
             mlp_tp_size = math.gcd(
                 tp,
                 self.mapping.gpus_per_node,
-            )  # Avoid costly inter-node TP when MNNVL is not supported
+            )  # Avoid costly inter-node TP
         else:
             mlp_tp_size = tp
         return mlp_tp_size
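
The gcd cap keeps MLP tensor parallelism within a single node. A quick worked example with illustrative numbers:

import math

tp, gpus_per_node = 16, 8
mlp_tp_size = math.gcd(tp, gpus_per_node)  # -> 8: the MLP TP group stays intra-node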
