mistral-nemo-12b from llama_8b
Signed-off-by: Alexandros Koumparoulis <[email protected]>
akoumpa committed Sep 30, 2024
1 parent b7c0bfa commit e46e205
Showing 1 changed file: nemo/collections/llm/recipes/mistral_nemo_12b.py (108 additions, 22 deletions)
@@ -13,7 +13,7 @@
# limitations under the License.


-from typing import Optional
+from typing import Callable, Optional

import nemo_run as run
import pytorch_lightning as pl
@@ -24,15 +24,17 @@
from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.squad import SquadDataModule
from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B
from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

NAME = "mistral_nemo_base_12b"


@run.cli.factory(name=NAME)
def model() -> run.Config[pl.LightningModule]:
"""
@@ -61,7 +63,7 @@ def trainer(
sequence_parallelism: bool = False,
num_nodes: int = 1,
num_gpus_per_node: int = 8,
-    max_steps: int = 100,
+    max_steps: int = 1168251,
callbacks: Optional[list[run.Config[Callback]]] = None,
) -> run.Config[nl.Trainer]:
"""
@@ -91,6 +93,10 @@
Python API usage:
>>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
>>> print(trainer_config)
Note:
For more information on distributed training strategies, refer to the
NeMo documentation on multi-GPU and multi-node training.
"""
strategy = run.Config(
nl.MegatronStrategy,
@@ -101,7 +107,6 @@
context_parallel_size=context_parallelism,
sequence_parallel=sequence_parallelism,
gradient_as_bucket_view=True,
-        ckpt_include_optimizer=True,
ckpt_async_save=True,
ckpt_parallel_load=True,
ddp=run.Config(
@@ -119,7 +124,6 @@
accumulate_grad_batches=1,
callbacks=callbacks,
devices=num_gpus_per_node,
-        gradient_clip_val=1.0,
limit_test_batches=50,
limit_val_batches=32,
log_every_n_steps=10,
@@ -157,37 +161,79 @@ def pretrain_recipe(
Examples:
CLI usage:
$ nemo llm pretrain --factory mistral_nemo_base_12b
$ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_mistral_pretrain')"
$ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_pretrain')"
Python API usage:
>>> recipe = pretrain_recipe(name="mistral_pretrain", num_nodes=2)
>>> recipe = pretrain_recipe(name="mistral_nemo_base_12b", num_nodes=2)
>>> print(recipe)
Note:
For more details on pre-training LLMs with NeMo, see the pre-training
guide in the `examples/llm/pretrain/` directory.
"""
return run.Partial(
fn,
model=model(),
trainer=trainer(
tensor_parallelism=2,
pipeline_parallelism=1,
pipeline_parallelism_type=None,
virtual_pipeline_parallelism=None,
context_parallelism=2,
sequence_parallelism=False,
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
callbacks=[run.Config(TimingCallback)],
),
-        data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
+        data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
resume=default_resume(),
)
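
For context on how such a recipe is consumed (outside the scope of this diff): a NeMo 2.0 recipe is a nemo_run Partial that is launched through an executor. A minimal local-launch sketch, assuming a single 8-GPU node; the executor settings follow the nemo_run quickstart, and the max_steps override is purely illustrative:

import nemo_run as run

from nemo.collections.llm.recipes import mistral_nemo_12b

if __name__ == "__main__":
    # Recipes are fiddle-style configs, so fields can be overridden before launch.
    recipe = mistral_nemo_12b.pretrain_recipe(name="mistral_pretrain", num_nodes=1, num_gpus_per_node=8)
    recipe.trainer.max_steps = 1000  # shorten the 1168251-step default for a smoke test
    run.run(recipe, executor=run.LocalExecutor(ntasks_per_node=8, launcher="torchrun"))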


@run.cli.factory(name=NAME + "_hf")
def hf_resume() -> run.Config[nl.AutoResume]:
@run.cli.factory(target=pretrain, name=NAME + "_optimized")
def pretrain_recipe_performance(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
fn: Callable = pretrain,
) -> run.Partial:
"""
-    Configure automatic resumption from a Hugging Face checkpoint for Mistral-Nemo-Base-12B model.
+    Create a performance-optimized pre-training recipe for Mistral-Nemo-Base-12B model.
This recipe enables performance optimizations that may not be suitable for all use cases.
It builds upon the standard pre-training recipe and adds additional performance enhancements.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the pre-training run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
fn (Callable): The pre-training function to use.
Returns:
run.Partial: Partial configuration for performance-optimized pre-training.
Examples:
$ nemo llm pretrain --factory mistral_nemo_base_12b_optimized
Python API usage:
>>> recipe = pretrain_recipe_performance(name="mistral_nemo_base_12b_perf", num_nodes=4)
>>> print(recipe)
Note:
Use this recipe with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)

recipe.trainer.callbacks.append(
run.Config(
MegatronCommOverlapCallback,
tp_comm_overlap=False,
)
)
return recipe
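
The callback above is appended with tp_comm_overlap=False. Tensor-parallel communication overlap only helps when tensor parallelism is greater than 1 and sequence parallelism is enabled, so flipping it on means adjusting the strategy too. An illustrative customization, not part of this commit; the attribute paths assume the trainer config built by trainer() above:

from nemo.collections.llm.recipes.mistral_nemo_12b import pretrain_recipe_performance

recipe = pretrain_recipe_performance(name="mistral_perf", num_nodes=4)
recipe.trainer.strategy.sequence_parallel = True     # overlap requires sequence parallelism
recipe.trainer.callbacks[-1].tp_comm_overlap = True  # the MegatronCommOverlapCallback appended above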


def hf_resume() -> run.Config[nl.AutoResume]:
"""Configure automatic resumption from a Hugging Face checkpoint.
This function sets up the configuration to resume training from a pre-trained
Hugging Face model checkpoint.
@@ -196,11 +242,51 @@ def hf_resume() -> run.Config[nl.AutoResume]:
Returns:
run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint.
Note:
This is particularly useful for fine-tuning scenarios where you want to
start from the pre-trained Mistral-Nemo-Base-12B model.
"""
return run.Config(
-        nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-Nemo-Base-2407")
+        nl.AutoResume,
+        restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-Nemo-Base-2407"),
)


@run.cli.factory(target=finetune, name=NAME)
def finetune_recipe(
dir: Optional[str] = None,
name: str = "default",
num_nodes: int = 1,
num_gpus_per_node: int = 8,
) -> run.Partial:
"""
Create a fine-tuning recipe for Mistral-Nemo-Base-12B model.
This function sets up a complete configuration for fine-tuning, including
model, trainer, data, logging, optimization, and resumption settings.
It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning.
Args:
dir (Optional[str]): Directory for saving logs and checkpoints.
name (str): Name of the fine-tuning run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
Returns:
run.Partial: Partial configuration for fine-tuning.
Examples:
CLI usage:
$ nemo llm finetune --factory mistral_nemo_base_12b
Python API usage:
>>> recipe = finetune_recipe(name="mistral_nemo_base_12b_finetune", num_nodes=2)
>>> print(recipe)
Note:
This recipe uses the SQuAD dataset for fine-tuning. For more information
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""
recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune)
recipe.resume = hf_resume()
recipe.peft = run.Config(LoRA)
recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1)
return recipe
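
Since recipe.peft is set to a bare run.Config(LoRA), the adapter uses the library defaults. A hedged sketch of narrowing the adaptation, not part of this commit; the field names (target_modules, dim, alpha) are assumed from nemo.collections.llm.peft.lora.LoRA:

import nemo_run as run

from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.mistral_nemo_12b import finetune_recipe

recipe = finetune_recipe(name="mistral_lora", num_nodes=1, num_gpus_per_node=8)
# Adapt only the attention projections with a rank-16 adapter.
recipe.peft = run.Config(LoRA, target_modules=["linear_qkv", "linear_proj"], dim=16, alpha=32)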
