 # limitations under the License.


-from typing import Optional
+from typing import Callable, Optional

 import nemo_run as run
 import pytorch_lightning as pl
 from nemo import lightning as nl
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
+from nemo.collections.llm.gpt.data.squad import SquadDataModule
 from nemo.collections.llm.gpt.model.mistral import MistralModel, MistralNeMoConfig12B
+from nemo.collections.llm.peft.lora import LoRA
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing
 from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed
+from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.utils.exp_manager import TimingCallback

 NAME = "mistral_nemo_base_12b"

-
 @run.cli.factory(name=NAME)
 def model() -> run.Config[pl.LightningModule]:
     """
@@ -61,7 +63,7 @@ def trainer(
     sequence_parallelism: bool = False,
     num_nodes: int = 1,
     num_gpus_per_node: int = 8,
-    max_steps: int = 100,
+    max_steps: int = 1168251,
     callbacks: Optional[list[run.Config[Callback]]] = None,
 ) -> run.Config[nl.Trainer]:
     """
@@ -91,6 +93,10 @@ def trainer(
         Python API usage:
             >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
             >>> print(trainer_config)
+
+    Note:
+        For more information on distributed training strategies, refer to the
+        NeMo documentation on multi-GPU and multi-node training.
     """
     strategy = run.Config(
         nl.MegatronStrategy,
@@ -101,7 +107,6 @@ def trainer(
         context_parallel_size=context_parallelism,
         sequence_parallel=sequence_parallelism,
         gradient_as_bucket_view=True,
-        ckpt_include_optimizer=True,
         ckpt_async_save=True,
         ckpt_parallel_load=True,
         ddp=run.Config(
@@ -119,7 +124,6 @@ def trainer(
         accumulate_grad_batches=1,
         callbacks=callbacks,
         devices=num_gpus_per_node,
-        gradient_clip_val=1.0,
         limit_test_batches=50,
         limit_val_batches=32,
         log_every_n_steps=10,
@@ -157,37 +161,79 @@ def pretrain_recipe(
     Examples:
         CLI usage:
             $ nemo llm pretrain --factory mistral_nemo_base_12b
-            $ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_mistral_pretrain')"
+            $ nemo llm pretrain --factory "mistral_nemo_base_12b(num_nodes=2, name='my_pretrain')"

         Python API usage:
-            >>> recipe = pretrain_recipe(name="mistral_pretrain", num_nodes=2)
+            >>> recipe = pretrain_recipe(name="mistral_nemo_base_12b", num_nodes=2)
             >>> print(recipe)
+
+    Note:
+        For more details on pre-training LLMs with NeMo, see the pre-training
+        guide in the `examples/llm/pretrain/` directory.
     """
     return run.Partial(
         fn,
         model=model(),
         trainer=trainer(
-            tensor_parallelism=2,
-            pipeline_parallelism=1,
-            pipeline_parallelism_type=None,
-            virtual_pipeline_parallelism=None,
-            context_parallelism=2,
-            sequence_parallelism=False,
             num_nodes=num_nodes,
             num_gpus_per_node=num_gpus_per_node,
             callbacks=[run.Config(TimingCallback)],
         ),
-        data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
+        data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
         log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
         optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
         resume=default_resume(),
     )


-@run.cli.factory(name=NAME + "_hf")
-def hf_resume() -> run.Config[nl.AutoResume]:
+@run.cli.factory(target=pretrain, name=NAME + "_optimized")
+def pretrain_recipe_performance(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+    fn: Callable = pretrain,
+) -> run.Partial:
     """
-    Configure automatic resumption from a Hugging Face checkpoint for Mistral-Nemo-Base-12B model.
+    Create a performance-optimized pre-training recipe for Mistral-Nemo-Base-12B model.
+
+    This recipe enables performance optimizations that may not be suitable for all use cases.
+    It builds upon the standard pre-training recipe and adds additional performance enhancements.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the pre-training run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+        fn (Callable): The pre-training function to use.
+
+    Returns:
+        run.Partial: Partial configuration for performance-optimized pre-training.
+
+    Examples:
+        $ nemo llm pretrain --factory mistral_nemo_base_12b_optimized
+
+        Python API usage:
+            >>> recipe = pretrain_recipe_performance(name="mistral_nemo_base_12b_perf", num_nodes=4)
+            >>> print(recipe)
+
+    Note:
+        Use this recipe with caution and only when you need maximum performance.
+        It may not be suitable for all hardware configurations or use cases.
+    """
+    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn)
+
+    recipe.trainer.callbacks.append(
+        run.Config(
+            MegatronCommOverlapCallback,
+            tp_comm_overlap=False,
+        )
+    )
+    return recipe
+
+
+def hf_resume() -> run.Config[nl.AutoResume]:
+    """Configure automatic resumption from a Hugging Face checkpoint.

     This function sets up the configuration to resume training from a pre-trained
     Hugging Face model checkpoint.
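The hunk above replaces the old `_hf` factory registration on `hf_resume` with a performance-oriented `_optimized` pre-training factory that appends a `MegatronCommOverlapCallback` (with `tp_comm_overlap=False`) to the standard recipe; `hf_resume` survives as a plain helper. A rough sketch of building the optimized recipe from Python and reaching the appended callback follows; the module path and the attribute-style access into `trainer.callbacks` are assumptions layered on top of this change.

```python
# Sketch only: construct the performance-optimized recipe and locate the
# comm-overlap callback it appends. Module path is assumed.
from nemo.collections.llm.recipes import mistral_nemo_base_12b

perf = mistral_nemo_base_12b.pretrain_recipe_performance(
    name="mistral_nemo_base_12b_perf",
    num_nodes=4,
    num_gpus_per_node=8,
)

# pretrain_recipe_performance() appends MegatronCommOverlapCallback last,
# so it can be inspected or tuned before launch.
comm_overlap = perf.trainer.callbacks[-1]
comm_overlap.tp_comm_overlap = False  # kept disabled, matching the recipe default; enabling it only makes sense with tensor/sequence parallelism
```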
@@ -196,11 +242,51 @@ def hf_resume() -> run.Config[nl.AutoResume]:

     Returns:
         run.Config[nl.AutoResume]: Configuration for resuming from HuggingFace checkpoint.
-
-    Note:
-        This is particularly useful for fine-tuning scenarios where you want to
-        start from the pre-trained Mistral-Nemo-Base-12B model.
     """
     return run.Config(
-        nl.AutoResume, restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-Nemo-Base-2407")
+        nl.AutoResume,
+        restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mistral-Nemo-Base-2407"),
     )
+
+
+@run.cli.factory(target=finetune, name=NAME)
+def finetune_recipe(
+    dir: Optional[str] = None,
+    name: str = "default",
+    num_nodes: int = 1,
+    num_gpus_per_node: int = 8,
+) -> run.Partial:
+    """
+    Create a fine-tuning recipe for Mistral-Nemo-Base-12B model.
+
+    This function sets up a complete configuration for fine-tuning, including
+    model, trainer, data, logging, optimization, and resumption settings.
+    It uses LoRA (Low-Rank Adaptation) for efficient fine-tuning.
+
+    Args:
+        dir (Optional[str]): Directory for saving logs and checkpoints.
+        name (str): Name of the fine-tuning run.
+        num_nodes (int): Number of compute nodes to use.
+        num_gpus_per_node (int): Number of GPUs per node.
+
+    Returns:
+        run.Partial: Partial configuration for fine-tuning.
+
+    Examples:
+        CLI usage:
+            $ nemo llm finetune --factory mistral_nemo_base_12b
+
+        Python API usage:
+            >>> recipe = finetune_recipe(name="mistral_nemo_base_12b_finetune", num_nodes=2)
+            >>> print(recipe)
+
+    Note:
+        This recipe uses the SQuAD dataset for fine-tuning. For more information
+        on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
+        `examples/llm/finetune/` directory.
+    """
+    recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune)
+    recipe.resume = hf_resume()
+    recipe.peft = run.Config(LoRA)
+    recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1)
+    return recipe
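The new `finetune_recipe` factory reuses `pretrain_recipe`, swaps the data module for SQuAD, attaches LoRA, and resumes from the Hugging Face checkpoint via `hf_resume()`. A minimal local-launch sketch is below; the module path, the `LocalExecutor` settings, and the batch-size override are assumptions layered on top of this diff, not part of it.

```python
# Sketch of launching the new LoRA fine-tuning recipe locally with nemo_run.
# Module path and executor settings are assumptions.
import nemo_run as run

from nemo.collections.llm.recipes import mistral_nemo_base_12b

if __name__ == "__main__":
    recipe = mistral_nemo_base_12b.finetune_recipe(
        name="mistral_nemo_base_12b_finetune",
        num_nodes=1,
        num_gpus_per_node=8,
    )

    # Hypothetical override: shrink the global batch for a single-node dry run
    # (the recipe defaults to global_batch_size=512 on SQuAD).
    recipe.data.global_batch_size = 128

    # torchrun-based local executor, one task per GPU.
    executor = run.LocalExecutor(ntasks_per_node=8, launcher="torchrun")
    run.run(recipe, executor=executor)
```

The same recipe can also be launched through the CLI as `nemo llm finetune --factory mistral_nemo_base_12b`, which is the path the `@run.cli.factory(target=finetune, name=NAME)` registration above enables.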