31 changes: 25 additions & 6 deletions examples/wide_ep/README.md
@@ -96,15 +96,34 @@ GPU memory are also on NUMA nodes on GB200 and system can also use that. Bind me
numactl -m 0,1 <command>
```

-### Shared Memory Clean Up on EPLB
+### Shared Memory on EPLB

-To achieve online load balance, all expert weights are stored in shared host memory. 4 ranks on same GB200 node share the same expert weights to save memory. Normally, these shared host memory will be cleaned up at process exit, but they may not get chance to be cleaned if an abnormal exit happens.
+To achieve online load balancing, all expert weights are stored in shared host memory. Four ranks on the same GB200 node share the same expert weights to save memory.

-In that case, when seeing the following (or similar) error message:
-```
-FileExistsError: [Errno 17] File exists: '/moe_shared_l0_lr0_all'
+The environment variable `TRTLLM_EPLB_SHM_NAME` specifies the base name of the shared memory. It may need to be set when multiple instances run on the same node; otherwise you can ignore it.

+The default value of `TRTLLM_EPLB_SHM_NAME` is `moe_shared`. With the default, the shared memory segments are named `moe_shared_l0_lr0_all`, `moe_shared_l1_lr0_all`, and so on, where `l0` denotes the first layer with EPLB, `lr0` denotes the part loaded by local rank 0, and `all` indicates that the segment contains all weights of each expert.
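
For example, when two instances share one GB200 node, each can be given its own base name so their EPLB segments in `/dev/shm` do not collide. A minimal sketch, assuming the naming pattern described above; `<command>` stands for your actual launch command, and `moe_shared_inst0`/`moe_shared_inst1` are arbitrary example names:

```bash
# Instance 0: its segments should appear as /dev/shm/moe_shared_inst0_l0_lr0_all, ...
TRTLLM_EPLB_SHM_NAME=moe_shared_inst0 <command> &

# Instance 1: its segments should appear as /dev/shm/moe_shared_inst1_l0_lr0_all, ...
TRTLLM_EPLB_SHM_NAME=moe_shared_inst1 <command> &
```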

+Normally, these shared memory segments are cleaned up automatically at process exit. However, they may be left behind if the process exits abnormally. Therefore, before creating new segments, EPLB automatically cleans up any leftover shared memory with the same name.

+If you experience an abnormal exit and are concerned about shared memory usage before the next run, manually check the `/dev/shm` directory and delete any `/dev/shm/moe_shared_*` files.

+#### Manual Cleanup Commands

+For manual cleanup of shared memory, you can use the following commands:

+```bash
+# List all moe_shared related shared memory
+ls -la /dev/shm/moe_shared_*

+# Remove all moe_shared related shared memory
+rm -f /dev/shm/moe_shared_*

+# Or remove specific shared memory segments
+rm -f /dev/shm/moe_shared_l0_lr0_all
```
-you need to manually check `/dev/shm` directory and delete `/dev/shm/moe_shared_*` if any.

+**Warning:** Be careful when removing shared memory manually, as this may affect running processes that depend on these shared memory segments.
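
To reduce that risk, you can check whether any live process still has the segments open before deleting them. A minimal sketch, assuming `lsof` is available in your environment:

```bash
# Lists processes that still have the segments open; delete only if nothing is printed
lsof /dev/shm/moe_shared_* 2>/dev/null || echo "no process is using these segments"
```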

### Disaggregated serving related issues

25 changes: 19 additions & 6 deletions tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
@@ -1,3 +1,4 @@
+import os
import threading
from contextlib import nullcontext
from multiprocessing import resource_tracker, shared_memory
@@ -176,9 +177,20 @@ def finalize_layer_weights(self):
            total_size += aligned_size

        shm_name = self.get_shared_memory_name()
-        shm = shared_memory.SharedMemory(name=shm_name,
-                                         create=True,
-                                         size=total_size)
+        try:
+            shm = shared_memory.SharedMemory(name=shm_name,
+                                             create=True,
+                                             size=total_size)
+        except FileExistsError:
+            tensorrt_llm.logger.warning(
+                f'Found existing EPLB shared memory name: {shm_name}, unlinking...'
+            )
+            existing_shm = shared_memory.SharedMemory(name=shm_name)
+            existing_shm.close()
+            existing_shm.unlink()
+            shm = shared_memory.SharedMemory(name=shm_name,
+                                             create=True,
+                                             size=total_size)
        self.own_shm = shm

        offset = 0
@@ -722,15 +734,15 @@ def __init__(self,
                 ep_rank: int,
                 ep_size: int,
                 layer_updates_per_iter: int,
-                 shared_memory_base_name: str = 'moe_shared'):
+                 shared_memory_base_name: Optional[str] = None):
        """
        Initialize a MoeLoadBalancer instance.

        Args:
            ep_rank: The rank of the current process in expert parallelism
            ep_size: The total number of processes in expert parallelism
            layer_updates_per_iter: The number of layers to update per iteration
-            shared_memory_base_name: Shared memory base name
+            shared_memory_base_name: Shared memory base name; if None, uses TRTLLM_EPLB_SHM_NAME or 'moe_shared'
        """
        self.is_shutdown = True
        self.ep_rank = ep_rank
@@ -740,7 +752,8 @@ def __init__(self,
            layer_updates_per_iter)
        self._previous_balancer = None
        self.single_layer_load_balancers = []
-        self.shared_memory_base_name = shared_memory_base_name
+        self.shared_memory_base_name = shared_memory_base_name or os.getenv(
+            'TRTLLM_EPLB_SHM_NAME', 'moe_shared')
        self._setup_mpi_comm()
        self.is_shutdown = False
2 changes: 0 additions & 2 deletions tests/integration/test_lists/waives.txt
@@ -307,8 +307,6 @@ accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbug
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143)
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/5481090)
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=False] SKIP (https://nvbugs/5483534)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False] SKIP (https://nvbugs/5488118)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5488118)
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5488118)