Commit fc8b29c

[Issue 5927][fix] Avoid memory calls during broadcast for single GPU (#6010)
Signed-off-by: John Calderon <[email protected]>
1 parent 0388ff9 commit fc8b29c

File tree

1 file changed: +12, -1 lines


tensorrt_llm/_utils.py

Lines changed: 12 additions & 1 deletion
@@ -509,7 +509,7 @@ def mpi_barrier():


 def mpi_broadcast(obj, root=0):
-    return mpi_comm().bcast(obj, root) if ENABLE_MULTI_DEVICE else obj
+    return mpi_comm().bcast(obj, root) if is_multi_device_enable() else obj


 def mpi_allgather(obj):
@@ -1079,3 +1079,14 @@ def _unique_tokens_to_json(data):
         "token_id": data.token_id,
         "token_extra_id": data.token_extra_id
     }
+
+
+def is_multi_device_enable():
+    """
+    Check whether we are running on multiple GPUs with the ENABLE_MULTI_DEVICE
+    flag set, so broadcast calls can be avoided on a single GPU.
+    Issue: https://github.com/NVIDIA/TensorRT-LLM/issues/5927
+    ENABLE_MULTI_DEVICE is true by default when building tensorrt-llm, so we
+    also need to check the number of devices.
+    """
+    return local_mpi_size() > 1
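The pattern in this commit can be sketched as a small standalone example. This is a hedged illustration, not the actual TensorRT-LLM code: `local_mpi_size` is stubbed to return 1 (a real build would query the MPI runtime), and the multi-device branch is represented by a placeholder rather than a real `mpi_comm().bcast` call.

```python
# Minimal sketch of the single-GPU fast path (names mirror the diff).

def local_mpi_size():
    # Stub: the real helper queries the local MPI world size.
    return 1

def is_multi_device_enable():
    """Return True only when more than one device/rank is present,
    so broadcast calls can be skipped on a single GPU."""
    return local_mpi_size() > 1

def mpi_broadcast(obj, root=0):
    # On a single device the object is returned as-is, avoiding the
    # memory-touching bcast call that issue 5927 reported.
    if is_multi_device_enable():
        raise RuntimeError("multi-device path: would call bcast(obj, root)")
    return obj

payload = {"weights": [1, 2, 3]}
result = mpi_broadcast(payload)
assert result is payload  # same object back: no copy, no MPI traffic
```

The key point is that the guard now depends on the actual device count, not only on the compile-time `ENABLE_MULTI_DEVICE` flag, which is true by default even for single-GPU runs.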
