diff --git a/examples/wide_ep/README.md b/examples/wide_ep/README.md
index 2c4fe5c0758..5aa1cdbc0ca 100644
--- a/examples/wide_ep/README.md
+++ b/examples/wide_ep/README.md
@@ -96,15 +96,46 @@ GPU memory are also on NUMA nodes on GB200 and system can also use that. Bind me
 numactl -m 0,1
 ```
 
-### Shared Memory Clean Up on EPLB
+### Shared Memory on EPLB
 
-To achieve online load balance, all expert weights are stored in shared host memory. 4 ranks on same GB200 node share the same expert weights to save memory. Normally, these shared host memory will be cleaned up at process exit, but they may not get chance to be cleaned if an abnormal exit happens.
+To achieve online load balancing, all expert weights are stored in shared host memory. Four ranks on the same GB200 node share the same expert weights to save memory.
 
-In that case, when seeing the following (or similar) error message:
-```
-FileExistsError: [Errno 17] File exists: '/moe_shared_l0_lr0_all'
+The environment variable `TRTLLM_EPLB_SHM_NAME` specifies the base name of the shared memory. You may need to set it when multiple instances run on the same node (see the example at the end of this section); otherwise, you can ignore it.
+
+The default value of `TRTLLM_EPLB_SHM_NAME` is `moe_shared`. With this default, the shared memory segments are named `moe_shared_l0_lr0_all`, `moe_shared_l1_lr0_all`, and so on. Here `l0` means the first layer with EPLB, `lr0` means the part loaded by local rank 0, and `all` means the segment contains all the weights of each expert.
+
+Normally, these shared memory segments are cleaned up automatically at process exit. However, they may be left behind if the process exits abnormally. Therefore, before creating new segments, EPLB automatically cleans up any leftover shared memory with the same name.
+
+If you experience an abnormal exit and want to reclaim the shared memory before the next run, manually check the `/dev/shm` directory and delete any `/dev/shm/moe_shared_*` files.
+
+#### Manual Cleanup Commands
+
+To manually clean up the shared memory, use the following commands:
+
+```bash
+# List all moe_shared related shared memory
+ls -la /dev/shm/moe_shared_*
+
+# Remove all moe_shared related shared memory
+rm -f /dev/shm/moe_shared_*
+
+# Or remove a specific shared memory segment
+rm -f /dev/shm/moe_shared_l0_lr0_all
 ```
-you need to manually check `/dev/shm` directory and delete `/dev/shm/moe_shared_*` if any.
+
+**Warning:** Be careful when removing shared memory manually, as this may affect running processes that depend on these shared memory segments.
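+
+For example, if two serving instances share one node, you can give each instance its own base name so that their segments do not collide. This is only an illustrative sketch; the base names and launch commands below are placeholders, not commands shipped with this example:
+
+```bash
+# Hypothetical two-instance setup: use a distinct shared memory base name per instance.
+# Replace "<launch command ...>" with the actual command used to start each instance.
+TRTLLM_EPLB_SHM_NAME=moe_shared_inst0 <launch command for instance 0>
+TRTLLM_EPLB_SHM_NAME=moe_shared_inst1 <launch command for instance 1>
+
+# The segments then appear separately under /dev/shm, e.g.
+# /dev/shm/moe_shared_inst0_l0_lr0_all and /dev/shm/moe_shared_inst1_l0_lr0_all.
+```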
 
 ### Disaggregated serving related issues
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py b/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
index ff26c87687a..530927d9ba7 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
@@ -1,3 +1,4 @@
+import os
 import threading
 from contextlib import nullcontext
 from multiprocessing import resource_tracker, shared_memory
@@ -176,9 +177,20 @@ def finalize_layer_weights(self):
             total_size += aligned_size
 
         shm_name = self.get_shared_memory_name()
-        shm = shared_memory.SharedMemory(name=shm_name,
-                                         create=True,
-                                         size=total_size)
+        try:
+            shm = shared_memory.SharedMemory(name=shm_name,
+                                             create=True,
+                                             size=total_size)
+        except FileExistsError:
+            tensorrt_llm.logger.warning(
+                f'Found existing EPLB shared memory name: {shm_name}, unlinking...'
+            )
+            existing_shm = shared_memory.SharedMemory(name=shm_name)
+            existing_shm.close()
+            existing_shm.unlink()
+            shm = shared_memory.SharedMemory(name=shm_name,
+                                             create=True,
+                                             size=total_size)
         self.own_shm = shm
         offset = 0
 
@@ -722,7 +734,7 @@ def __init__(self,
                  ep_rank: int,
                  ep_size: int,
                  layer_updates_per_iter: int,
-                 shared_memory_base_name: str = 'moe_shared'):
+                 shared_memory_base_name: Optional[str] = None):
         """
         Initialize a MoeLoadBalancer instance.
 
@@ -730,7 +742,7 @@ def __init__(self,
             ep_rank: The rank of the current process in expert parallelism
             ep_size: The total number of processes in expert parallelism
             layer_updates_per_iter: The number of layers to update per iteration
-            shared_memory_base_name: Shared memory base name
+            shared_memory_base_name: Shared memory base name; if None, the TRTLLM_EPLB_SHM_NAME environment variable or 'moe_shared' is used
         """
         self.is_shutdown = True
         self.ep_rank = ep_rank
@@ -740,7 +752,8 @@ def __init__(self,
                                             layer_updates_per_iter)
         self._previous_balancer = None
         self.single_layer_load_balancers = []
-        self.shared_memory_base_name = shared_memory_base_name
+        self.shared_memory_base_name = shared_memory_base_name or os.getenv(
+            'TRTLLM_EPLB_SHM_NAME', 'moe_shared')
         self._setup_mpi_comm()
         self.is_shutdown = False
 
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 8997bf7d0ab..0649e6bb1fc 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -307,8 +307,6 @@ accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbug
 accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143)
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/5481090)
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False] SKIP (https://nvbugs/5483534)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5488118)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5488118)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5488118)