
Commit 2d46dda

yuxianq authored and chzblych committed
[https://nvbugs/5448754][fix] Download HF model for all nodes. (#6824)
Signed-off-by: Yuxian Qiu <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent 123f5cb commit 2d46dda

File tree

4 files changed: +54 -20 lines changed


jenkins/L0_Test.groovy

Lines changed: 6 additions & 7 deletions
@@ -2184,19 +2184,18 @@ def launchTestJobs(pipeline, testFilter)
         ]
         fullSet += SBSASlurmTestConfigs.keySet()
 
-        multiNodesSBSAConfigs = [
+        // multiNodesSBSAConfigs = [
         // Each stage test 1 testcase with 8 GPUs and 2 nodes.
         // Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
         // "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
         // "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
         // "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
         // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 5, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 5, 8, 2],
-        ]
+        // ]
+        multiNodesSBSAConfigs = [:]
+        multiNodesSBSAConfigs += (1..7).collectEntries { i ->
+            ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]]
+        }
         fullSet += multiNodesSBSAConfigs.keySet()
 
         if (env.targetArch == AARCH64_TRIPLE) {
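For readers who don't write Groovy, the collectEntries call above builds the same map as this Python dict comprehension (an illustrative sketch, not project code; the field meanings are inferred from the surrounding entries and inline comments: split id, split count, GPUs per stage, node count):

# Sketch: Python equivalent of the Groovy collectEntries block.
# Seven generated post-merge stages replace five hand-written ones,
# and the split count in each entry is bumped from 5 to 7 accordingly.
multi_nodes_sbsa_configs = {
    f"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-{i}":
        ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]
    for i in range(1, 8)
}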

tensorrt_llm/llmapi/llm_utils.py

Lines changed: 30 additions & 6 deletions
@@ -7,13 +7,13 @@
 import weakref
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 
 import torch
 from tqdm import tqdm
 
-from .._utils import (global_mpi_rank, mpi_barrier, mpi_broadcast, mpi_rank,
-                      release_gc)
+from .._utils import (global_mpi_rank, local_mpi_rank, mpi_barrier,
+                      mpi_broadcast, mpi_rank, release_gc)
 from ..auto_parallel import AutoParallelConfig
 # yapf: disable
 from ..bindings.executor import (BatchingType, CapacitySchedulerPolicy,
@@ -616,6 +616,17 @@ def workspace(self) -> Path:
             self._workspace, tempfile.TemporaryDirectory) else Path(
                 self._workspace)
 
+    def _submit_to_all_workers(
+        self,
+        task: Callable[..., Any],
+        *args,
+        **kwargs,
+    ) -> List[Any]:
+        if self.llm_args.parallel_config.is_multi_gpu:
+            return self.mpi_session.submit_sync(task, *args, **kwargs)
+        else:
+            return [task(*args, **kwargs)]
+
     def __call__(self) -> Tuple[Path, Union[Path, None]]:
 
         if self.llm_args.model_format is _ModelFormatKind.TLLM_ENGINE:
@@ -636,9 +647,11 @@ def __call__(self) -> Tuple[Path, Union[Path, None]]:
                 f'backend {self.llm_args.backend} is not supported.')
 
         if self.model_loader.model_obj.is_hub_model:
-            self._hf_model_dir = download_hf_model(
-                self.model_loader.model_obj.model_name,
-                self.llm_args.revision)
+            hf_model_dirs = self._submit_to_all_workers(
+                CachedModelLoader._node_download_hf_model,
+                model=self.model_loader.model_obj.model_name,
+                revision=self.llm_args.revision)
+            self._hf_model_dir = hf_model_dirs[0]
         else:
             self._hf_model_dir = self.model_loader.model_obj.model_dir
 
@@ -815,6 +828,17 @@ def build_task(engine_dir: Path):
 
         return self.get_engine_dir()
 
+    @print_traceback_on_error
+    @staticmethod
+    def _node_download_hf_model(
+        model: str,
+        revision: Optional[str] = None,
+    ) -> Optional[Path]:
+        if local_mpi_rank() == 0:
+            return download_hf_model(model, revision)
+        else:
+            return None
+
     @print_traceback_on_error
     @staticmethod
     def _node_build_task(
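Taken together, the change means that for Hub models the download no longer happens only on the driver: each node's local rank 0 fetches the snapshot, so every node has the files in its Hugging Face cache before loading, and the driver keeps the path returned by global rank 0 (hf_model_dirs[0]). A minimal standalone sketch of the same pattern, assuming mpi4py and huggingface_hub are available (names here are illustrative, not the TensorRT-LLM API):

# Sketch: one Hugging Face download per node in an MPI job.
from pathlib import Path
from typing import Optional

from huggingface_hub import snapshot_download
from mpi4py import MPI


def node_download_hf_model(model: str,
                           revision: Optional[str] = None) -> Optional[Path]:
    # Group ranks that share a node; the group's rank 0 is the node-local
    # leader, playing the role of local_mpi_rank() == 0 in the diff above.
    local = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
    if local.rank == 0:
        # One network download per node; the other local ranks read the
        # snapshot from the shared node-local cache.
        return Path(snapshot_download(model, revision=revision))
    return None


if __name__ == "__main__":
    dirs = MPI.COMM_WORLD.allgather(node_download_hf_model("Qwen/Qwen3-8B"))
    # Global rank 0 is a node-local leader, so dirs[0] is a real path,
    # the same reason CachedModelLoader.__call__ can use hf_model_dirs[0].
    print(dirs[0])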

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 5 deletions
@@ -2513,16 +2513,25 @@ def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
         task.evaluate(llm)
 
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler",
-        [(1, 1, 1, False, True, True)],
-        ids=["latency"])
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,is_cached",
+        [(1, 1, 1, False, True, True, True),
+         pytest.param(8,
+                      1,
+                      1,
+                      False,
+                      True,
+                      True,
+                      False,
+                      marks=pytest.mark.skip_less_mpi_world_size(8))],
+        ids=["latency", "multi_gpus_no_cache"])
     def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                  overlap_scheduler):
+                  overlap_scheduler, is_cached):
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B",
+        with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B"
+                 if is_cached else "Qwen/Qwen3-8B",
                  tensor_parallel_size=tp_size,
                  pipeline_parallel_size=pp_size,
                  moe_expert_parallel_size=ep_size,
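The new multi_gpus_no_cache variant passes the Hub id "Qwen/Qwen3-8B" instead of a path under llm_models_root(), which makes is_hub_model true and exercises the per-node download path on an 8-rank world. The skip_less_mpi_world_size(8) mark is project-specific; a generic pytest sketch of the same parametrization shape (plain skip used as a stand-in):

# Sketch: conditional model source via pytest.param, generic pytest only.
import pytest


@pytest.mark.parametrize(
    "tp_size,is_cached",
    [(1, True),
     pytest.param(8, False,
                  marks=pytest.mark.skip(reason="needs an 8-rank MPI world"))],
    ids=["latency", "multi_gpus_no_cache"])
def test_model_source(tp_size, is_cached):
    # Cached runs read a local checkout; the no-cache run passes the Hub id,
    # which is what triggers the download path added in llm_utils.py.
    model = "/models/Qwen3/Qwen3-8B" if is_cached else "Qwen/Qwen3-8B"
    assert model.startswith("/") == is_cached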

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 4 additions & 2 deletions
@@ -33,5 +33,7 @@ l0_gb200_multi_nodes:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
-- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
-- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
+- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180)
