NVIDIA · yizhang-nv · Jul 14, 2025 · Jul 10, 2025
@@ -1825,9 +1825,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 
     multiNodesSBSAConfigs = [
         // Each stage runs 1 test case with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 3, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 3, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
     ]
     fullSet += multiNodesSBSAConfigs.keySet()
 

@@ -38,6 +38,7 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
+from tensorrt_llm._ipc_utils import can_access_peer
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
@@ -604,6 +605,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
         self.enable_attention_dp = mapping.enable_attention_dp
 
         self.mlp_tp_size = mapping.tp_size
+        self.is_p2p_supported = can_access_peer(mapping)
 
         self.fusion_config = EagerFusionConfig()
         self.enable_fusion = os.environ.get(
@@ -843,11 +845,11 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
                     hidden_states, residual)
 
             # Note: this fusion pattern is currently only supported for the
             # TRTLLM-nvfp4 backend, and only when self.is_p2p_supported is true.
-            do_finalize = not (hidden_states.shape[0]
-                               <= self.moe_allreduce.max_token
-                               and self.fusion_config.POST_MOE_FUSION
-                               and self.model_config.moe_backend == 'TRTLLM'
-                               and self.mlp.experts.has_nvfp4)
+            do_finalize = not (
+                hidden_states.shape[0] <= self.moe_allreduce.max_token
+                and self.fusion_config.POST_MOE_FUSION
+                and self.model_config.moe_backend == 'TRTLLM'
+                and self.mlp.experts.has_nvfp4 and self.is_p2p_supported)
 
             hidden_states = _run_MoE(hidden_states,
                                      hidden_states_fp4=None,

@@ -15,4 +15,5 @@ l0_gb200_multi_nodes:
   tests:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[throughput_tp8] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency_trtllmgen] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency] TIMEOUT (180)