PEFT GPT & T5 Refactor #7308

Merged · 167 commits · Sep 28, 2023
Changes from 137 commits

Commits
66b0607
initial implementation of add_adapters API
cuichenx Aug 16, 2023
82fcfe8
correct type hint
cuichenx Aug 16, 2023
d8a13a3
Add config in add_adapters for save and load (@author bobchen)
cuichenx Aug 17, 2023
4b02f0d
Remove AdapterConfig to avoid import error
meatybobby Aug 18, 2023
06b46a1
Add AdaterConfig back and move adaptermixin to sft model
meatybobby Aug 19, 2023
63f4c74
Add NLPSaveRestoreConnector as default in NLPModel.restore_from
meatybobby Aug 22, 2023
08dbcee
Add restore_from_nemo_with_adapter and test script
meatybobby Aug 22, 2023
48f9bcc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 23, 2023
6a894fa
rename t5 file and classes to be consistent with GPT
cuichenx Aug 21, 2023
b44052b
add t5 sft dataset
cuichenx Aug 21, 2023
cfcd00d
add support for single-file format with T5SFTDataset
cuichenx Aug 21, 2023
fefcfe6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 24, 2023
5c2ab84
Various small changes to make T5 SFT work like GPT SFT
cuichenx Aug 24, 2023
b2eed6b
Merge remote-tracking branch 'origin/peft_refactor' into peft_refactor
cuichenx Aug 24, 2023
916729a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 24, 2023
a72b4ab
Add adapter evaluation test script
meatybobby Aug 24, 2023
4e0258e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 24, 2023
4eee5b6
Add MultiAdaterConfig for ia3 and fix builder issue
meatybobby Aug 25, 2023
0e72328
Make ptuning for T5SFTModel work using mixin
cuichenx Aug 25, 2023
79be201
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 25, 2023
2be50cd
Add IA3_Adapter for AdapterName
meatybobby Aug 25, 2023
0481d27
Add adapter name for ptuning and attention adapter
meatybobby Aug 28, 2023
cfd1105
Make test script GPT/T5 agnostic
cuichenx Aug 28, 2023
3f7d1ab
Add layer selection feature
cuichenx Aug 28, 2023
fd27447
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 28, 2023
5caf328
Merge branch 'main' into peft_refactor
meatybobby Aug 28, 2023
3a4a22d
Integrate adapter name and config
meatybobby Aug 28, 2023
5011227
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 28, 2023
219a5bb
update gpt peft tuning script to new API
cuichenx Aug 28, 2023
0ef1cab
add t5 peft tuning script with new API
cuichenx Aug 28, 2023
52b5dbb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 28, 2023
4ff60bc
Fix IA3 layer selection issue
meatybobby Aug 29, 2023
43a0da5
Override state_dict on SFT model instead of mixin
meatybobby Aug 29, 2023
fd178d2
Add load adapter by adapter config
meatybobby Aug 29, 2023
ee69afe
move peft config map away from example script
cuichenx Aug 29, 2023
0e34c3f
auto get config from nemo adapter
meatybobby Aug 29, 2023
fb8fc50
Move PEFTConfig to new file
meatybobby Aug 30, 2023
e671279
fix ckpt save/load for t5
cuichenx Aug 30, 2023
a0be911
name change: add_adapters -> add_adapter
cuichenx Aug 30, 2023
b398566
variable name change
cuichenx Aug 30, 2023
33d37a1
update t5 script
cuichenx Aug 30, 2023
7a06a46
Merge branch 'main' into peft_refactor
meatybobby Aug 30, 2023
1798554
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 30, 2023
922f887
fix t5 issues
cuichenx Aug 31, 2023
20ef4d9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 31, 2023
2277552
Add weight tying
meatybobby Aug 31, 2023
b433c1d
update gpt tuning script
cuichenx Aug 31, 2023
ce0c5ba
PEFT-API proposal
marcromeyn Aug 30, 2023
37da440
Fix according to comments
marcromeyn Aug 31, 2023
de3c0b2
update tuning scripts
cuichenx Aug 31, 2023
811dd6d
move merge_cfg_with to mixin class since it applies to both gpt and t…
cuichenx Aug 31, 2023
0c50a57
Merge branch 'main' into peft_refactor
meatybobby Aug 31, 2023
50c4b67
Add mcore_gpt support for NLPAdapterMixin
meatybobby Aug 31, 2023
eb70660
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 31, 2023
f7fc5af
fix typo
cuichenx Aug 31, 2023
c25296d
variable name change to distinguish "peft" and "adapter"
cuichenx Aug 31, 2023
b2ea917
override `load_adapters` to support `add_adapter` name change
cuichenx Aug 31, 2023
125dc88
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 31, 2023
d1c5381
update tuning and eval script for adapter save/load
cuichenx Sep 1, 2023
21cc7c9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 1, 2023
9ed5a17
Add Ptuning on first stage only
meatybobby Sep 1, 2023
c85daa5
add lora tutorial for review
cuichenx Sep 1, 2023
ff5dc2b
Fix layer selection for mcore
meatybobby Sep 1, 2023
5d90fb2
add landing page
cuichenx Sep 1, 2023
1fb3d8b
Merge remote-tracking branch 'origin/peft_refactor' into peft_refactor
cuichenx Sep 1, 2023
38788ef
fix resume training
blahBlahhhJ Sep 1, 2023
48ddfd7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 1, 2023
ead563c
add mcore condition in sharded_state_dict to make sft work
cuichenx Sep 1, 2023
b8f4676
Update lora_tutorial.md
hkelly33 Sep 1, 2023
a5be21b
rename Adapter to AttentionAdapter to avoid confusion in doc
cuichenx Sep 2, 2023
e408cb7
Change load_adapters to load .nemo
meatybobby Sep 2, 2023
4682c92
Merge remote-tracking branch 'origin/peft_refactor' into peft_refactor
cuichenx Sep 2, 2023
e121591
add quick start guide
cuichenx Sep 2, 2023
c928a34
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 2, 2023
9f7fb91
Add load_adapters with .ckpt
meatybobby Sep 2, 2023
3c0e2ed
Remove setup_complete changes in load_adapters
meatybobby Sep 2, 2023
a63b188
update landing page
cuichenx Sep 2, 2023
c2cb621
remove typo
cuichenx Sep 2, 2023
ac625fd
Updated quick_start.md per Chen Cui
hkelly33 Sep 5, 2023
4cfbb8f
Add inference config merger and tutorial
meatybobby Sep 5, 2023
45346ca
Add doc string for NLPAdapterModelMixin and deprecated warning on Meg…
meatybobby Sep 6, 2023
ca96a5b
add supported_methods.md and update other documentations
cuichenx Sep 6, 2023
4ca143f
Update supported_methods.md
arendu Sep 6, 2023
6104da7
Update landing_page.md
arendu Sep 6, 2023
eb8365f
Modify doc string for NLPAdapterModelMixin
meatybobby Sep 6, 2023
c533832
Add doc string add_adapters in NLPAdapterModelMixin
meatybobby Sep 6, 2023
d1c753d
rename canonical adapters
cuichenx Sep 6, 2023
cd78076
remove mcore hard dependency
cuichenx Sep 7, 2023
0d3f61e
[PATCH] move microbatch calculator to nemo from apex
Sep 7, 2023
0236e3c
remove apex dependency in gpt and t5 sft models
cuichenx Sep 7, 2023
0cac299
remove apex dependency in gpt model
cuichenx Sep 7, 2023
1c09a5a
render doc strings
cuichenx Sep 8, 2023
b03fe5d
fix
cuichenx Sep 8, 2023
263d465
Add missing virtual_tokens on ptuning
meatybobby Sep 8, 2023
84c765e
fix docstrings
cuichenx Sep 8, 2023
e375eaa
update gpt-style model coverage in docs
cuichenx Sep 8, 2023
3717132
update docstring
cuichenx Sep 8, 2023
57136ed
Remove pdb
meatybobby Sep 8, 2023
ba20af0
add lightning_fabric to make docstring rendering work
cuichenx Sep 8, 2023
8ccaaaf
Add Ptuning missing key
meatybobby Sep 8, 2023
7e2ef24
try docstring rendering
cuichenx Sep 9, 2023
0b66e48
Fix ptuning issue
meatybobby Sep 9, 2023
dc267bd
update gpt t5 peft tuning and eval scripts
cuichenx Sep 11, 2023
303e9c9
typos
cuichenx Sep 11, 2023
9a9e417
update eval config
cuichenx Sep 11, 2023
490bdc9
fix bug relating to apex dependency removal
cuichenx Sep 12, 2023
af212dd
typo
cuichenx Sep 12, 2023
0ce6a63
make predict step behave the same as test step
cuichenx Sep 12, 2023
6f493b4
make lora tutorial work in notebook
cuichenx Sep 12, 2023
49a8fa7
cosmetics
cuichenx Sep 12, 2023
ca63fc5
update yaml scripts
cuichenx Sep 12, 2023
a1d577a
Merge remote-tracking branch 'origin/peft_refactor' into peft_refactor
cuichenx Sep 12, 2023
8340695
mcore_gpt attribute optional
cuichenx Sep 12, 2023
d470fed
typo
cuichenx Sep 12, 2023
952ca53
update eval scripts and fix T5 eval bugs
cuichenx Sep 12, 2023
fb4e3d9
add NLPDDPStrategyNotebook and trainer builder logic to use it
cuichenx Sep 13, 2023
5440847
update lora notebook to use new trainer builder
cuichenx Sep 13, 2023
c2cc936
fix microbatch calculator bug for inference after training
cuichenx Sep 13, 2023
d134f03
Convert markdown files to RST and incorporate with doc
cuichenx Sep 13, 2023
08755a0
typo
cuichenx Sep 18, 2023
4f414db
revise language
cuichenx Sep 18, 2023
1276078
remove extra cell
cuichenx Sep 18, 2023
64ecf05
remove unnecessary inheritance
cuichenx Sep 18, 2023
833ad14
remove old tests
cuichenx Sep 18, 2023
bf4d771
move layer selection default so logging messages make sense
cuichenx Sep 18, 2023
3f13f53
remove `save_adapters` as adapter weights are saved automatically dur…
cuichenx Sep 19, 2023
45d9f4b
initialize weights from a checkpoint instead of randomly
cuichenx Sep 19, 2023
8bc32ca
multiple fields can form a context (#7147)
arendu Sep 1, 2023
7d2458a
revert config changes
cuichenx Sep 21, 2023
7c16e7d
remove accidental breakpoint
cuichenx Sep 22, 2023
29d9197
support TP>1 loading
cuichenx Sep 22, 2023
cf82f6d
infer adapter type from checkpoint in during eval
cuichenx Sep 22, 2023
068d68b
breakup add adapter
cuichenx Sep 22, 2023
90bc80b
enable interpolation of train_ds and validation_ds
cuichenx Sep 22, 2023
d5017e6
update metric calc script to conform to single-file eval format
cuichenx Sep 22, 2023
05b89e2
remove extraneous print
cuichenx Sep 22, 2023
a44a34b
update lora notebook for updated merge_inference_cfg
cuichenx Sep 23, 2023
4eac726
Update nlp_adapter_mixins.py
cuichenx Sep 25, 2023
b7752ef
turn off grad scaler for PP to match old scripts
cuichenx Sep 25, 2023
745a57e
remove PEFTSaveRestoreConnector since functionality all covered by th…
cuichenx Sep 25, 2023
553647a
Merge remote-tracking branch 'origin/peft_refactor' into peft_refactor
cuichenx Sep 25, 2023
a9453de
remove resume_from_checkpoint check since covered in #7335
cuichenx Sep 25, 2023
03a6be2
revert changes made in eval config interpolation
cuichenx Sep 25, 2023
ffde138
more interpolation
cuichenx Sep 25, 2023
ceb4b2d
typo
cuichenx Sep 25, 2023
a2dd4a0
Merge branch 'main' into peft_refactor
cuichenx Sep 25, 2023
09cde67
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 25, 2023
50b5848
remove dup line
cuichenx Sep 25, 2023
ad56c7c
code style warnings
cuichenx Sep 25, 2023
af67d40
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 25, 2023
83f340e
fix config mistake
cuichenx Sep 25, 2023
ebf94f3
add copyright header
cuichenx Sep 26, 2023
33b3d92
fix code check warnings
cuichenx Sep 26, 2023
9c32b15
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 26, 2023
2db5fef
revert changes to remove apex dependency (mixed apex+nemo microbatch …
cuichenx Sep 26, 2023
53ed5fd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 26, 2023
838a7b7
add more deprecation notices
cuichenx Sep 26, 2023
cf3892f
update deprecation notices
cuichenx Sep 26, 2023
8e238fa
update deprecation notices
cuichenx Sep 26, 2023
dc0fe10
consolidate peft and sft scripts
cuichenx Sep 26, 2023
19b831f
update CI tests
cuichenx Sep 26, 2023
772f93b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 26, 2023
5817f15
notebook branch points to main to prepare for merge
cuichenx Sep 27, 2023
9353c6d
fix gpt and t5 validation with any metric other than loss
cuichenx Sep 27, 2023
aa3dba0
Merge branch 'main' into peft_refactor
cuichenx Sep 27, 2023
918f9ca
support pre-extracted checkpoints
cuichenx Sep 28, 2023
79a878e
Merge branch 'main' into peft_refactor
cuichenx Sep 28, 2023
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -52,6 +52,7 @@
'attr', # attrdict in requirements, attr in import
'torchmetrics', # inherited from PTL
'lightning_utilities', # inherited from PTL
'lightning_fabric',
'apex',
'megatron.core',
'transformer_engine',
14 changes: 12 additions & 2 deletions docs/source/nlp/api.rst
@@ -81,7 +81,6 @@ Modules
.. autoclass:: nemo.collections.nlp.modules.common.megatron.module.Float16Module
:show-inheritance:


.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron.gpt_model.GPTModel
:show-inheritance:
:no-members:
@@ -140,11 +139,22 @@ Datasets
.. autoclass:: nemo.collections.nlp.data.language_modeling.megatron.ul2_dataset.UL2Dataset
:show-inheritance:


Adapter Mixin Class
-------------------------

.. autoclass:: nemo.collections.nlp.parts.mixins.nlp_adapter_mixins.NLPAdapterModelMixin
:show-inheritance:
:members: add_adapter, load_adapters, merge_cfg_with, merge_inference_cfg
:exclude-members: first_stage_of_pipeline, tie_weights, get_peft_state_dict, state_dict, sharded_state_dict, load_state_dict, on_load_checkpoint
:member-order: bysource


Exportable Model Classes
-------------------------

.. autoclass:: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTExportableModel
:show-inheritance:
:show-inheritance:

.. toctree::
:maxdepth: 1
1 change: 1 addition & 0 deletions docs/source/nlp/nemo_megatron/intro.rst
@@ -25,6 +25,7 @@ team at NVIDIA. NeMo Megatron supports several types of models:
prompt_learning
retro/retro_model
hiddens/hiddens_module
peft/landing_page


References
35 changes: 35 additions & 0 deletions docs/source/nlp/nemo_megatron/peft/landing_page.rst
@@ -0,0 +1,35 @@
Parameter-Efficient Fine-Tuning (PEFT)
======================================

PEFT is a popular technique used to efficiently finetune large language
models for use in various downstream tasks. When finetuning with PEFT,
the base model weights are frozen, and a few trainable adapter modules
are injected into the model, resulting in a very small number (<< 1%) of
trainable weights. With carefully chosen adapter modules and injection
points, PEFT achieves comparable performance to full finetuning at a
fraction of the computational and storage costs.

NeMo supports four PEFT methods which can be used with various
transformer-based models.

==================== ===== ===== ========= ==
\ GPT 3 NvGPT LLaMa 1/2 T5
==================== ===== ===== ========= ==
Adapters (Canonical) ✅ ✅ ✅ ✅
LoRA ✅ ✅ ✅ ✅
IA3 ✅ ✅ ✅ ✅
P-Tuning ✅ ✅ ✅ ✅
==================== ===== ===== ========= ==

Learn more about PEFT in NeMo with the :ref:`peftquickstart`, which provides an overview of how PEFT works
in NeMo. Read about the supported PEFT methods
`here <supported_methods.html>`__. For a practical example, take a look at
the `Step-by-step Guide <https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/lora.ipynb>`__.

The API guide can be found `here <../../api.html#adapter-mixin-class>`__.

.. toctree::
:maxdepth: 1

quick_start
supported_methods
90 changes: 90 additions & 0 deletions docs/source/nlp/nemo_megatron/peft/quick_start.rst
@@ -0,0 +1,90 @@
.. _peftquickstart:


Quick Start Guide
=================

The quick start guide provides an overview of a PEFT workflow in NeMo.

Terminology: PEFT vs Adapter
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This tutorial uses "PEFT" to describe the overall parameter-efficient
fine-tuning method, and "adapter" to describe the additional module
injected into a frozen base model. Each PEFT method can use one or more
types of adapters.

One of the PEFT methods is itself commonly referred to as "adapters",
because it was one of the first proposed uses of adapter modules for NLP.
This PEFT method is called "canonical" adapters here to distinguish the
two usages.

How PEFT works in NeMo models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Each PEFT method has one or more types of adapters that need to be
injected into the base model. In NeMo models, the adapter logic and
adapter weights are already built into the submodules, but they are
disabled by default for ordinary training and fine-tuning.

When doing PEFT, the adapter logic path is enabled by calling
``model.add_adapter(peft_cfg)``. In this function, the model scans each
of its submodules for adapter logic paths that apply to the current PEFT
method and enables them. The base model's weights are then frozen, while
the newly added adapter weights are left unfrozen and updated during
fine-tuning, which is what makes the method parameter-efficient.
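
A minimal sketch of what this looks like in practice, assuming a restored
``MegatronGPTSFTModel`` called ``model`` and that the PEFT config classes are
importable from ``nemo.collections.nlp.parts.peft_config`` (an assumed import
path, so adjust it to your installation):

.. code:: python

   from nemo.collections.nlp.parts.peft_config import LoraPEFTConfig

   # Enable the LoRA adapter logic paths and freeze the base model weights.
   model.add_adapter(LoraPEFTConfig(model.cfg))

   # Only the injected adapter weights should now require gradients.
   trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
   total = sum(p.numel() for p in model.parameters())
   print(f"trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")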

PEFT config classes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Each PEFT method is specified by a ``PEFTConfig`` class which stores the
types of adapters applicable to the PEFT method, as well as
hyperparameters required to initialize these adapter modules. These four
PEFT methods are currently supported:

1. Adapters (canonical): ``CanonicalAdaptersPEFTConfig``
2. LoRA: ``LoraPEFTConfig``
3. IA3: ``IA3PEFTConfig``
4. P-Tuning: ``PtuningPEFTConfig``

These config classes make experimenting with different adapters as easy
as changing the config class.

Moreover, it is possible to combine PEFT methods in NeMo since they are
orthogonal to each other. This is done by passing a list of
``PEFTConfig`` objects to ``add_adapter`` instead of a single one. For
example, a common workflow is to combine P-Tuning with canonical
adapters, which can be achieved with
``model.add_adapter([PtuningPEFTConfig(model_cfg), CanonicalAdaptersPEFTConfig(model_cfg)])``,
as shown in the sketch below.
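
The snippet below sketches both patterns, assuming a restored SFT model
``model`` with config ``model_cfg`` and the same assumed import path as above
(the ``PEFT_CONFIGS`` mapping is a hypothetical helper, not part of the NeMo API):

.. code:: python

   from nemo.collections.nlp.parts.peft_config import (
       CanonicalAdaptersPEFTConfig,
       IA3PEFTConfig,
       LoraPEFTConfig,
       PtuningPEFTConfig,
   )

   # Hypothetical helper: switching PEFT methods is just a matter of
   # instantiating a different config class from the same model config.
   PEFT_CONFIGS = {
       "adapter": CanonicalAdaptersPEFTConfig,
       "lora": LoraPEFTConfig,
       "ia3": IA3PEFTConfig,
       "ptuning": PtuningPEFTConfig,
   }

   # Pick one method...
   peft_cfg = PEFT_CONFIGS["lora"](model_cfg)
   model.add_adapter(peft_cfg)

   # ...or combine orthogonal methods by passing a list of configs instead:
   # model.add_adapter([PtuningPEFTConfig(model_cfg), CanonicalAdaptersPEFTConfig(model_cfg)])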

Base model classes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
PEFT in NeMo is built with a mix-in class that does not belong to any
model in particular. This means that the same interface is available to
different NeMo models. Currently, NeMo supports PEFT for GPT-style
models such as GPT 3, NvGPT, LLaMa 1/2 (``MegatronGPTSFTModel``), as
well as T5 (``MegatronT5SFTModel``).

Full finetuning vs PEFT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can switch between full fine-tuning and PEFT by removing calls to
``add_adapter`` and ``load_adapters``.

The code snippet below illustrates the core API of full fine-tuning and
PEFT.

.. code:: diff

trainer = MegatronTrainerBuilder(config).create_trainer()
model_cfg = MegatronGPTSFTModel.merge_cfg_with(config.model.restore_from_path, config)

model = MegatronGPTSFTModel.restore_from(restore_path, model_cfg, trainer) # restore from pretrained ckpt
+ peft_cfg = LoraPEFTConfig(model_cfg)
+ model.add_adapter(peft_cfg)
trainer.fit(model) # saves adapter weights only

# Restore from base then load adapter API
model = MegatronGPTSFTModel.restore_from(restore_path, model_cfg, trainer)
+ model.load_adapters(adapter_save_path, peft_cfg)
model.freeze()
trainer.predict(model)
71 changes: 71 additions & 0 deletions docs/source/nlp/nemo_megatron/peft/supported_methods.rst
@@ -0,0 +1,71 @@


Supported PEFT methods
----------------------

NeMo supports the following PEFT tuning methods:

1. **Adapters (Canonical)**: `Parameter-Efficient Transfer Learning for
NLP <http://arxiv.org/abs/1902.00751>`__

- Adapters (Houlsby setup) is one of the first PEFT methods applied
to NLP. Adapter tuning is more efficient than full fine-tuning
because the base model weights are frozen, while only a small
number of adapter module weights are updated. In this method, two
linear layers with a bottleneck and a non-linear activation are
inserted into each transformer layer via a residual connection. In
each case, the output linear layer is initialized to 0 to ensure
that an untrained adapter does not affect the normal forward pass
of the transformer layer.

2. **LoRA**: `LoRA: Low-Rank Adaptation of Large Language
Models <http://arxiv.org/abs/2106.09685>`__

- LoRA makes fine-tuning efficient by representing weight updates
with two low rank decomposition matrices. The original model
weights remain frozen, while the low rank decomposition matrices
are updated to adapt to the new data, so the number of trainable
parameters is kept low. In contrast with adapters, the original
model weights and adapted weights can be combined during
inference, avoiding any architectural change or additional latency
in the model at inference time.
- The matrix decomposition operation can be applied to any linear
layer, but in practice it is only applied to the K, Q, V
projection matrices (sometimes just to the Q and V projections).
Since NeMo's attention implementation fuses KQV into a single
projection, our LoRA implementation learns a single low-rank
projection for KQV in a combined fashion (see the sketch after this
list).

3. **IA3**: `Few-Shot Parameter-Efficient Fine-Tuning is Better and
Cheaper than In-Context Learning <http://arxiv.org/abs/2205.05638>`__

- IA3 makes fine-tuning efficient by rescaling activations with
learned vectors. The rescaling layers are injected in the
attention (for key and value) and feedforward modules in the base
model. Similar to other PEFT methods, only the rescaling vectors
are updated during fine-tuning to adapt to the new data so the
number of updated parameters is low. However, since rescaling
vectors are much smaller than low rank matrices (LoRA) and
bottleneck layers (Adapters), IA3 cuts down the number of
trainable parameters further by an order of magnitude. The
learned rescaling vectors can also be merged with the base
weights, leading to no architectural change and no additional
latency at inference time.

4. **P-Tuning**: `GPT Understands,
Too <https://arxiv.org/abs/2103.10385>`__

- P-tuning is an example of the prompt learning family of methods,
in which trainable virtual tokens are inserted into the model
input prompt to induce it to perform a task. Virtual tokens (also
called "continuous" or "soft" tokens) are embeddings that have no
concrete mapping to strings or characters within the model’s
vocabulary. They are simply 1D vectors that match the
dimensionality of the real token embeddings that make up the model's
vocabulary.
- In p-tuning, an intermediate LSTM or MLP model is used to generate
virtual token embeddings. We refer to this intermediate model as
our ``prompt_encoder``. The prompt encoder parameters are randomly
initialized at the start of p-tuning. All base model parameters
are frozen, and only the prompt encoder weights are updated at
each training step.
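
For reference, here is a minimal sketch of the LoRA update described above,
using the notation of the LoRA paper rather than NeMo's internal variable
names: for a frozen weight matrix :math:`W_0 \in \mathbb{R}^{d \times k}`,
LoRA learns :math:`B \in \mathbb{R}^{d \times r}` and
:math:`A \in \mathbb{R}^{r \times k}` with rank :math:`r \ll \min(d, k)` and
computes

.. math::

   h = W_0 x + \frac{\alpha}{r} B A x,

where :math:`\alpha` is a constant scaling factor and :math:`B` is initialized
to zero, so an untrained adapter leaves the base model's output unchanged. At
inference time, :math:`\frac{\alpha}{r} B A` can be folded into :math:`W_0`,
which is why LoRA adds no architectural change or extra latency.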
8 changes: 4 additions & 4 deletions examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py
@@ -18,9 +18,9 @@
from pytorch_lightning.plugins.environments import TorchElasticEnvironment
from pytorch_lightning.plugins.precision import MixedPrecisionPlugin

from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel
from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model
from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel
from nemo.collections.nlp.parts.nlp_overrides import GradScaler, MegatronHalfPrecisionPlugin, NLPDDPStrategy
from nemo.core.config import hydra_runner
from nemo.utils import logging
@@ -122,13 +122,13 @@ def main(cfg) -> None:
model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config)
else:
if cfg.model.restore_from_path:
t5_cfg = MegatronT5FinetuneModel.restore_from(
t5_cfg = MegatronT5SFTModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config)
model = load_from_nemo(MegatronT5SFTModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config)
else:
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config)
model = load_from_checkpoint_dir(MegatronT5SFTModel, cfg, trainer, modify_confg_fn=_modify_config)

model.freeze()
trainer.validate(model)
@@ -21,9 +21,9 @@
from pytorch_lightning.plugins.environments import TorchElasticEnvironment
from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector

from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel
from nemo.collections.nlp.models.language_modeling.megatron_glue_model import MegatronT5GLUEModel
from nemo.collections.nlp.models.language_modeling.megatron_t0_model import MegatronT0Model
from nemo.collections.nlp.models.language_modeling.megatron_t5_sft_model import MegatronT5SFTModel
from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
from nemo.collections.nlp.parts.nlp_overrides import (
GradScaler,
@@ -206,13 +206,13 @@ def main(cfg) -> None:
model = load_from_checkpoint_dir(MegatronT0Model, cfg, trainer, modify_confg_fn=_modify_config)
else:
if cfg.model.restore_from_path:
t5_cfg = MegatronT5FinetuneModel.restore_from(
t5_cfg = MegatronT5SFTModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
model = load_from_nemo(MegatronT5FinetuneModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config)
model = load_from_nemo(MegatronT5SFTModel, cfg, trainer, t5_cfg, modify_confg_fn=_modify_config)
else:
validate_checkpoint_loading_args(cfg.model.pretrained_checkpoint)
model = load_from_checkpoint_dir(MegatronT5FinetuneModel, cfg, trainer, modify_confg_fn=_modify_config)
model = load_from_checkpoint_dir(MegatronT5SFTModel, cfg, trainer, modify_confg_fn=_modify_config)

trainer.fit(model)
trainer.validate(model)