
Commit

Merge branch 'main' into pr_fix_ln_sm_margin
guyueh1 authored Nov 19, 2024
2 parents 745dbb2 + 184b2e3 commit 1d97ff2
Showing 19 changed files with 19 additions and 17 deletions.
2 changes: 1 addition & 1 deletion auto_configurator/conf/config.yaml
@@ -18,7 +18,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data

-training_container: nvcr.io/nvidia/nemo:24.07
+training_container: nvcr.io/nvidia/nemo:24.09
 container_mounts:
 - null
2 changes: 1 addition & 1 deletion auto_configurator/tests/config_tests/test_main_config.py
@@ -25,7 +25,7 @@ def test_config(self):
 base_results_dir: ${auto_configurator_path}/results
 data_dir: ${launcher_scripts_path}/data
-training_container: nvcr.io/nvidia/nemo:24.07
+training_container: nvcr.io/nvidia/nemo:24.09
 container_mounts:
 - null
2 changes: 1 addition & 1 deletion csp_tools/aws/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-FROM nvcr.io/nvidia/nemo:24.07
+FROM nvcr.io/nvidia/nemo:24.09

 ARG NCCL_VERSION=2.18.5-1+cuda12.2
 ARG EFA_INSTALLER_VERSION=1.28.0
2 changes: 1 addition & 1 deletion examples/peft/llama/a100/lora_4gpu_k8s.sh
@@ -18,7 +18,7 @@ HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.
 cluster=k8s_v2 \
 cluster_type=k8s \
 cluster.ib_interfaces=null \
-container=nvcr.io/nvidia/nemo:24.07 \
+container=nvcr.io/nvidia/nemo:24.09 \
 stages=[peft] \
 peft=${PEFT_CONFIG} \
 launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
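Note: the script above drives the launcher entirely through Hydra-style key=value overrides. A hedged sketch of composing the same invocation from Python; the PEFT_CONFIG fallback below is an assumed example value, not something defined in this repository:

# Hypothetical Python wrapper around the same launch command; main.py and the
# override keys come from the script above, the PEFT_CONFIG fallback is assumed.
import os
import subprocess

launcher_dir = os.environ.get("NEMO_FRAMEWORK_LAUNCHER_DIR", ".")
peft_config = os.environ.get("PEFT_CONFIG", "llama/squad")  # assumed example value

cmd = [
    "python3",
    f"{launcher_dir}/launcher_scripts/main.py",
    "cluster=k8s_v2",
    "cluster_type=k8s",
    "cluster.ib_interfaces=null",
    "container=nvcr.io/nvidia/nemo:24.09",  # tag updated by this commit
    "stages=[peft]",
    f"peft={peft_config}",
    f"launcher_scripts_path={launcher_dir}/launcher_scripts",
]
print(" ".join(cmd))               # inspect the command first
# subprocess.run(cmd, check=True)  # uncomment to actually launch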
2 changes: 1 addition & 1 deletion launcher_scripts/conf/config.yaml
@@ -50,7 +50,7 @@ data_dir: ${launcher_scripts_path}/data # Location to store and read the data.
 base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
 container_mounts: # List of additional paths to mount to container. They will be mounted to same path.
 - null
-container: nvcr.io/nvidia/nemo:24.07
+container: nvcr.io/nvidia/nemo:24.09

 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line.
 wandb_api_bcp_secret_key: null # For BCP clusters, read the W&B api key directly from the environment variable set as a secret from BCP. The value must match the name of the environment variable in BCP, such as WANDB_TOKEN.
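Note: these configs rely on OmegaConf's ${...} interpolation, so data_dir and base_results_dir follow launcher_scripts_path automatically. A minimal sketch of how the resolution behaves; the absolute path is an assumed example, not a repository default:

from omegaconf import OmegaConf  # the library already imported elsewhere in this diff

cfg = OmegaConf.create("""
launcher_scripts_path: /opt/NeMo-Framework-Launcher/launcher_scripts  # assumed path
data_dir: ${launcher_scripts_path}/data
base_results_dir: ${launcher_scripts_path}/results
container: nvcr.io/nvidia/nemo:24.09
""")

# Interpolations resolve on access, so both derived paths track the root value.
print(cfg.data_dir)          # /opt/NeMo-Framework-Launcher/launcher_scripts/data
print(cfg.base_results_dir)  # /opt/NeMo-Framework-Launcher/launcher_scripts/results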
1 change: 1 addition & 0 deletions (file path not shown in this capture)
@@ -91,6 +91,7 @@ model:
 # LLM configs
 # use GPTModel from megatron.core
 mcore_gpt: True
+dist_ckpt_format: 'zarr'
 moe_grouped_gemm: false
 moe_token_dispatcher_type: alltoall
 moe_aux_loss_coeff: 0.01
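Note: the added dist_ckpt_format: 'zarr' pins the distributed-checkpoint serialization format explicitly rather than relying on the container default. A minimal sketch of reading the key with OmegaConf; the consuming code path inside NeMo is not shown in this diff, and the alternative format name is an assumption from NeMo documentation:

from omegaconf import OmegaConf

# Sketch only: how a model config carrying the new key might be inspected.
model_cfg = OmegaConf.create({"mcore_gpt": True, "dist_ckpt_format": "zarr"})

fmt = model_cfg.get("dist_ckpt_format", "torch_dist")  # assumed fallback
# 'zarr' and 'torch_dist' are the two formats NeMo's distributed
# checkpointing exposes (assumption based on NeMo docs, not this diff).
assert fmt in ("zarr", "torch_dist")
print(f"checkpoints will be saved in {fmt} format")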
1 change: 1 addition & 0 deletions (file path not shown in this capture)
@@ -90,6 +90,7 @@ model:
 # LLM configs
 # use GPTModel from megatron.core
 mcore_gpt: True
+dist_ckpt_format: 'zarr'

 moe_grouped_gemm: false
 moe_token_dispatcher_type: alltoall
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -22,7 +22,7 @@

 import hydra
 import numpy as np
-import pytorch_lightning as pl
+import lightning.pytorch as pl
 import torch
 import torch.utils.data as data
 import webdataset as wds
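Note: the import rewrites in this and the following files track PyTorch Lightning's repackaging. Since Lightning 2.x the framework ships inside the unified lightning package as lightning.pytorch, while pytorch_lightning is the older standalone distribution. A hedged compatibility sketch (not part of this commit) that works under either packaging:

# Prefer the unified package (Lightning >= 2.x); fall back to the standalone
# distribution if only the old package is installed. Illustration only.
try:
    import lightning.pytorch as pl
except ImportError:
    import pytorch_lightning as pl

# The familiar API is available under the same alias either way.
trainer = pl.Trainer(max_epochs=1, accelerator="auto", devices=1, logger=False)
print(type(trainer).__module__)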
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
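Note: here the deep module path changes mechanically: pytorch_lightning.trainer.trainer.Trainer becomes lightning.pytorch.trainer.trainer.Trainer. Lightning also re-exports the class at the package root, so the deep import and the short one should name the same object; a small sketch to illustrate (an assumption about Lightning's packaging, not something this commit tests):

# Both names should refer to the same class in Lightning >= 2.0
# (assumption based on Lightning's public re-exports).
from lightning.pytorch import Trainer as TopLevelTrainer
from lightning.pytorch.trainer.trainer import Trainer as DeepTrainer

assert TopLevelTrainer is DeepTrainer
print(DeepTrainer.__module__)  # expected: lightning.pytorch.trainer.trainer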
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -41,7 +41,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -30,7 +30,7 @@
 from nemo.utils import logging
 from nemo.utils.app_state import AppState
 from nemo.utils.get_rank import is_global_rank_zero
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -30,7 +30,7 @@
 from nemo.utils import logging
 from nemo.utils.app_state import AppState
 from nemo.utils.get_rank import is_global_rank_zero
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -38,7 +38,7 @@
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.model_utils import inject_model_parallel_rank
 from omegaconf import OmegaConf, open_dict
-from pytorch_lightning.trainer.trainer import Trainer
+from lightning.pytorch.trainer.trainer import Trainer
 from torch.nn.utils.rnn import pad_sequence
 from torch.utils.data import DataLoader, Dataset
 from torch.utils.data.dataloader import default_collate
2 changes: 1 addition & 1 deletion (file path not shown in this capture)
@@ -56,7 +56,7 @@ def test_config(self):
 base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs.
 container_mounts: # List of additional paths to mount to container. They will be mounted to same path.
 - null
-container: nvcr.io/nvidia/nemo:24.07
+container: nvcr.io/nvidia/nemo:24.09
 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line.
 wandb_api_bcp_secret_key: null # For BCP clusters, read the W&B api key directly from the environment variable set as a secret from BCP. The value must match the name of the environment variable in BCP, such as WANDB_TOKEN.
