diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index dba1960d9ad..e097b11d245 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1825,9 +1825,10 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 
     multiNodesSBSAConfigs = [
         // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 3, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 3, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 3, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
     ]
 
     fullSet += multiNodesSBSAConfigs.keySet()
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 8685bab0cb6..6893b17df28 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -38,6 +38,7 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
+from tensorrt_llm._ipc_utils import can_access_peer
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
@@ -604,6 +605,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
         self.enable_attention_dp = mapping.enable_attention_dp
 
         self.mlp_tp_size = mapping.tp_size
+        self.is_p2p_supported = can_access_peer(mapping)
 
         self.fusion_config = EagerFusionConfig()
         self.enable_fusion = os.environ.get(
@@ -843,11 +845,11 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
                 hidden_states, residual)
 
             # Note: this fusion pattern is only supported for TRTLLM-nvfp4 backend now
-            do_finalize = not (hidden_states.shape[0]
-                               <= self.moe_allreduce.max_token
-                               and self.fusion_config.POST_MOE_FUSION
-                               and self.model_config.moe_backend == 'TRTLLM'
-                               and self.mlp.experts.has_nvfp4)
+            do_finalize = not (
+                hidden_states.shape[0] <= self.moe_allreduce.max_token
+                and self.fusion_config.POST_MOE_FUSION
+                and self.model_config.moe_backend == 'TRTLLM'
+                and self.mlp.experts.has_nvfp4 and self.is_p2p_supported)
 
             hidden_states = _run_MoE(hidden_states,
                                      hidden_states_fp4=None,
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index f8e553f1e68..7065b0b6956 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -15,4 +15,5 @@ l0_gb200_multi_nodes:
   tests:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[throughput_tp8] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency_trtllmgen] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency] TIMEOUT (180)
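
Note on the modeling change: the post-MoE allreduce fusion (the `do_finalize=False` path through `self.moe_allreduce`) is now additionally gated on `self.is_p2p_supported`, presumably because the fused allreduce kernel relies on direct peer-to-peer access between the GPUs in the group. As a rough illustration only, here is a minimal sketch of what a node-local peer-access check can look like, built on PyTorch's `torch.cuda.can_device_access_peer`; this is a hypothetical stand-in, not the actual implementation of `tensorrt_llm._ipc_utils.can_access_peer`, whose signature (it takes a `Mapping`) and logic may differ.

import torch

def all_local_gpus_can_access_peer(device_ids):
    # Hypothetical sketch: return True only if every ordered pair of
    # local GPUs reports CUDA peer-to-peer accessibility.
    for src in device_ids:
        for dst in device_ids:
            if src != dst and not torch.cuda.can_device_access_peer(src, dst):
                return False
    return True

# Example: check all GPUs visible on this node.
if torch.cuda.is_available():
    print(all_local_gpus_can_access_peer(range(torch.cuda.device_count())))

A check along these lines would flag topologies where GPUs communicate only through the host (no NVLink or PCIe P2P), in which case the diff falls back to the unfused path by forcing `do_finalize` to True.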