diff --git a/pyproject.toml b/pyproject.toml
index 29f3a1603b..1382333a45 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,7 +116,7 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-megatron-core = { path = "3rdparty/Megatron-LM/" }
+megatron-core = { path = "3rdparty/Megatron-LM/", editable = true }
 nvidia-modelopt = { git = "https://github.com/NVIDIA/TensorRT-Model-Optimizer.git", rev = "0a4f0a8b933121f7af080261a0a5a7717f2c5d49" }
 nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "v0.4.1" }
 # Requires a source install to compile cupti for cuda13
diff --git a/src/megatron/bridge/training/initialize.py b/src/megatron/bridge/training/initialize.py
index 8b00832f7d..88718f6c0e 100644
--- a/src/megatron/bridge/training/initialize.py
+++ b/src/megatron/bridge/training/initialize.py
@@ -14,6 +14,7 @@
 
 import datetime
 import os
+import time
 import warnings
 from typing import Callable, Optional
 
@@ -21,6 +22,7 @@
 import torch.distributed
 import torch.nn.functional as F
 from megatron.core import parallel_state, tensor_parallel
+from megatron.core.datasets.utils import compile_helpers
 from megatron.core.fusions.fused_bias_dropout import bias_dropout_add_fused_train
 from megatron.core.fusions.fused_bias_gelu import bias_gelu
 from megatron.core.fusions.fused_bias_swiglu import bias_swiglu
@@ -115,7 +117,7 @@ def initialize_megatron(
     init_rerun_state(rerun_state_machine_config)
 
     # torch.distributed initialization
-    return torch_dist_init(
+    result = torch_dist_init(
         model_config=model_config,
         dist_config=dist_config,
         rng_config=rng_config,
@@ -128,6 +130,22 @@
         use_inprocess_restart=use_inprocess_restart,
     )
 
+    # Compile dataset helpers after distributed initialization
+    if torch.distributed.is_initialized():
+        if get_rank_safe() == 0:
+            start_time = time.time()
+            print("> compiling dataset index builder ...")
+            compile_helpers()
+            print(
+                ">>> done with dataset index builder. Compilation time: {:.3f} seconds".format(
+                    time.time() - start_time
+                ),
+                flush=True,
+            )
+        torch.distributed.barrier()
+
+    return result
+
 
 def torch_dist_init(
     model_config: GPTModelProvider | T5ModelProvider,
diff --git a/uv.lock b/uv.lock
index ad1132ddac..3ac3a0597b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3006,7 +3006,7 @@ requires-dist = [
     { name = "flash-linear-attention" },
     { name = "hydra-core", specifier = ">1.3,<=1.3.2" },
     { name = "mamba-ssm" },
-    { name = "megatron-core", extras = ["dev", "mlm"], directory = "3rdparty/Megatron-LM" },
+    { name = "megatron-core", extras = ["dev", "mlm"], editable = "3rdparty/Megatron-LM" },
     { name = "mlflow", specifier = ">=3.2.0" },
     { name = "nemo-run", marker = "extra == 'recipes'", specifier = ">=0.5.0a0,<0.6.0" },
     { name = "nvdlfw-inspect", marker = "extra == 'tensor-inspect'", specifier = "==0.2.1" },
@@ -3066,7 +3066,7 @@ test = [
 
 [[package]]
 name = "megatron-core"
-source = { directory = "3rdparty/Megatron-LM" }
+source = { editable = "3rdparty/Megatron-LM" }
 dependencies = [
     { name = "numpy" },
     { name = "packaging" },