Skip to content

Commit 71a3ec4

Browse files
committed
[Bugfix][P/D] Reduce num_threads used by NIXL UCX backend
Fixes an issue in RDMA environments where the NIC ran out of UAR space while NVSHMEM was being initialized for DeepEP, when DeepEP is used together with P/D (prefill/decode disaggregation). Signed-off-by: David Whyte-Gray <[email protected]>
1 parent 1c691f4 commit 71a3ec4

File tree

1 file changed

+11
-1
lines changed

1 file changed

+11
-1
lines changed

vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -541,15 +541,25 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str):
541541
# TODO temporary, once nixl allows for telemetry flag in config
542542
# (next release), we can remove this env var.
543543
os.environ["NIXL_TELEMETRY_ENABLE"] = "1"
544+
544545
# Agent.
545546
non_ucx_backends = [b for b in self.nixl_backends if b != "UCX"]
547+
# Configure NIXL num_threads (default 4, overridable via the "num_threads" key in the kv_transfer_config extra config) to avoid UAR exhaustion on Mellanox NICs.
548+
# Each UCX thread allocates UARs (doorbell pages) via DevX, and
549+
# excessive NIXL UAR usage can exhaust NIC UAR space. This can cause
550+
# components like NVSHMEM (used by DeepEP kernels) to fail during RDMA
551+
# initialization with "mlx5dv_devx_alloc_uar" errors.
552+
# Ref: https://network.nvidia.com/files/doc-2020/ethernet-adapters-programming-manual.pdf#page=63
553+
num_threads = vllm_config.kv_transfer_config.get_from_extra_config(
554+
"num_threads", 4
555+
)
546556
if nixl_agent_config is None:
547557
config = None
548558
else:
549559
config = (
550560
nixl_agent_config(backends=self.nixl_backends)
551561
if len(non_ucx_backends) > 0
552-
else nixl_agent_config(num_threads=8)
562+
else nixl_agent_config(num_threads=num_threads)
553563
)
554564

555565
self.nixl_wrapper = NixlWrapper(str(uuid.uuid4()), config)

0 commit comments

Comments
 (0)