Merge branch 'main' into jiemingz/export_refit_trtllm_v10
JimmyZhang12 committed Jul 2, 2024
2 parents 992252e + 5a9000f commit c155d67
Showing 143 changed files with 8,868 additions and 6,604 deletions.
7 changes: 7 additions & 0 deletions .github/labeler.yml
@@ -34,6 +34,13 @@ TTS:
- tests/collections/tts/**
- tests/collections/common/tokenizers/text_to_speech/**

Audio:
- nemo/collections/audio/**/*
- examples/audio/**/*
- tutorials/audio/**/*
- docs/source/audio/**/*
- tests/collections/audio/**

core:
- nemo/core/**/*
- tests/core/**
47 changes: 43 additions & 4 deletions .github/workflows/cicd-main.yml
@@ -213,7 +213,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.algorithm=null \
export.save_path=/home/TestData/nlp/megatron_llama/ci_baseline
@@ -226,7 +226,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -245,7 +245,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_quantization.py \
python examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
quantization.algorithm=int8_sq \
@@ -274,7 +274,7 @@ jobs:
# - name: Checkout repository
# uses: actions/checkout@v4
# - run: |
# python examples/nlp/language_modeling/megatron_gpt_quantization.py \
# python examples/nlp/language_modeling/megatron_gpt_ptq.py \
# model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
# model.tensor_model_parallel_size=1 \
# trainer.devices=1 \
@@ -288,6 +288,45 @@ jobs:
#- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
# if: "failure()"

L2_QAT_Llama2_INT4:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
python examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
quantization.algorithm=int4 \
quantization.num_calib_size=8 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_steps=4 \
trainer.val_check_interval=4 \
+trainer.limit_val_batches=2 \
exp_manager.explicit_log_dir=llama2_qat_results \
model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
model.tensor_model_parallel_size=1 \
model.pipeline_model_parallel_size=1 \
model.global_batch_size=2 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl]
rm -rf llama2_qat_results
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# L2: ASR dev run
ASR_dev_run_Speech_to_Text:
needs: [cicd-test-container-setup]
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -33,7 +33,7 @@ WORKDIR /workspace

# Install NeMo requirements
ARG TE_TAG=bfe21c3d68b0a9951e5716fb520045db53419c5e
ARG MODELOPT_VERSION=0.11.0
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=02871b4df8c69fac687ab6676c4246e936ce92d0
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
69 changes: 68 additions & 1 deletion docs/source/core/exp_manager.rst
@@ -248,9 +248,76 @@ You might also want to adjust the callback parameters:
Straggler detection might involve inter-rank synchronization, and should be invoked with reasonable frequency (e.g. every few minutes).

.. _nemo_multirun-label:
Fault Tolerance
---------------

.. _exp_manager_fault_tolerance_support-label:

.. note::
    The Fault Tolerance feature is included in the optional NeMo resiliency package.

When training DNN models, faults may occur, hindering the progress of the entire training process.
This is particularly common in distributed, multi-node training scenarios, with many nodes and GPUs involved.

NeMo incorporates a fault tolerance mechanism to detect training halts.
In response, it can terminate a hung workload and, if requested, restart it from the last checkpoint.

Fault tolerance ("FT") relies on a special launcher (``ft_launcher``), which is a modified ``torchrun``.
The FT launcher runs background processes called rank monitors. **You need to use ft_launcher to start
your workload if you are using FT**. For example, `NeMo-Framework-Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_
can be used to generate SLURM batch scripts with FT support.

Each training process (rank) sends `heartbeats` to its monitor during training and validation steps.
If a rank monitor stops receiving `heartbeats`, a training failure is detected.
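
As a rough mental model (a conceptual sketch only, not the actual ``ft_launcher`` implementation), the monitor's job boils down to a timed wait on incoming heartbeats, with a longer budget for the very first one:

.. code-block:: python

    # Conceptual sketch of heartbeat-based hang detection; illustrative only.
    import queue

    def watch_rank(heartbeats: queue.Queue, initial_timeout: float, timeout: float) -> bool:
        """Return True if the rank finished cleanly, False if it is considered hung."""
        current_timeout = initial_timeout      # generous budget for workload initialization
        while True:
            try:
                msg = heartbeats.get(timeout=current_timeout)
            except queue.Empty:
                return False                   # no heartbeat in time: rank considered hung
            if msg == "done":
                return True                    # training finished cleanly
            current_timeout = timeout          # subsequent heartbeats use the shorter timeout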

Fault detection is implemented in the ``FaultToleranceCallback`` and is disabled by default.
To enable it, add a ``create_fault_tolerance_callback: True`` option under ``exp_manager`` in the
config YAML file. Additionally, you can customize FT parameters by adding a ``fault_tolerance`` section:

.. code-block:: yaml

    exp_manager:
        ...
        create_fault_tolerance_callback: True
        fault_tolerance:
            initial_rank_heartbeat_timeout: 600  # wait for 10 minutes for the initial heartbeat
            rank_heartbeat_timeout: 300  # wait for 5 minutes for subsequent heartbeats
            calculate_timeouts: True  # estimate more accurate timeouts based on observed intervals

Timeouts for fault detection need to be adjusted for a given workload:
* ``initial_rank_heartbeat_timeout`` should be long enough to allow for workload initialization.
* ``rank_heartbeat_timeout`` should be at least as long as the longest possible interval between steps.

**Importantly, `heartbeats` are not sent during checkpoint loading and saving**, so the time taken by
checkpointing-related operations should be taken into account.

If ``calculate_timeouts: True``, timeouts will be automatically estimated based on the observed intervals.
Estimated timeouts take precedence over timeouts defined in the config file. **Timeouts are estimated only after
checkpoint loading and saving have been observed**. For example, in multi-part training started from scratch,
estimated timeouts won't be available during the first run. Estimated timeouts are stored in the checkpoint.

``max_subsequent_job_failures`` allows for the automatic continuation of training on a SLURM cluster.
This feature requires the SLURM job to be scheduled with ``NeMo-Framework-Launcher``. If the ``max_subsequent_job_failures``
value is `>0`, a continuation job is pre-scheduled. It will continue the work until ``max_subsequent_job_failures``
subsequent jobs have failed (SLURM job exit code `!= 0`) or the training is completed successfully
(the "end of training" marker file is produced by the ``FaultToleranceCallback``, e.g. when the iteration or time limit is reached).

Summary of all FT configuration items:

* ``workload_check_interval`` (float, default=5.0) Periodic workload check interval [seconds] in the workload monitor.
* ``initial_rank_heartbeat_timeout`` (Optional[float], default=60.0 * 60.0) Timeout for the first heartbeat from a rank.
* ``rank_heartbeat_timeout`` (Optional[float], default=45.0 * 60.0) Timeout for subsequent heartbeats from a rank.
* ``calculate_timeouts`` (bool, default=True) Try to calculate ``rank_heartbeat_timeout`` and ``initial_rank_heartbeat_timeout``
  based on the observed heartbeat intervals.
* ``rank_termination_signal`` (signal.Signals, default=signal.SIGKILL) Signal used to terminate the rank when a failure is detected.
* ``log_level`` (str, default='INFO') Log level for the FT client and server (rank monitor).
* ``max_rank_restarts`` (int, default=0) Used by the FT launcher. Max number of restarts for a rank.
  If ``>0``, ranks will be restarted on existing nodes in case of a failure.
* ``max_subsequent_job_failures`` (int, default=0) Used by the FT launcher. How many subsequent job failures are allowed before autoresuming stops.
  ``0`` means do not autoresume.
* ``additional_ft_launcher_args`` (str, default='') Additional FT launcher params (for advanced use).
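
For convenience, the same items can be collected into a single config fragment. The snippet below is an illustrative Python dictionary of the defaults listed above (e.g. for merging into the ``exp_manager`` config programmatically); it is a sketch of the documented defaults, not an authoritative schema:

.. code-block:: python

    # Defaults taken from the list above; the termination signal is given by name here.
    fault_tolerance_defaults = {
        "workload_check_interval": 5.0,                  # seconds between workload checks
        "initial_rank_heartbeat_timeout": 60.0 * 60.0,   # 1 hour for the first heartbeat
        "rank_heartbeat_timeout": 45.0 * 60.0,           # 45 minutes for subsequent heartbeats
        "calculate_timeouts": True,
        "rank_termination_signal": "SIGKILL",
        "log_level": "INFO",
        "max_rank_restarts": 0,
        "max_subsequent_job_failures": 0,                # 0 = do not autoresume
        "additional_ft_launcher_args": "",
    }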


.. _nemo_multirun-label:
Hydra Multi-Run with NeMo
-------------------------

2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -12,7 +12,7 @@ NVIDIA NeMo Framework is an end-to-end, cloud-native framework designed to build
- Flash Attention
- Activation Recomputation
- Positional Embeddings and Positional Interpolation
- Post-Training Quantization (PTQ) with ModelOpt
- Post-Training Quantization (PTQ) and Quantization Aware Training (QAT) with `TensorRT Model Optimizer <https://github.com/NVIDIA/TensorRT-Model-Optimizer>`_
- Sequence Packing

`NVIDIA NeMo Framework <https://github.com/NVIDIA/NeMo>`_ has separate collections for:
83 changes: 66 additions & 17 deletions docs/source/nlp/quantization.rst
@@ -55,6 +55,10 @@ Table below presents verified model support matrix for popular LLM architectures
- ✅
- ✅
- ✅
* - `Nemotron-4 340b <https://huggingface.co/nvidia/Nemotron-4-340B-Base>`_ (Base, Instruct, Reward)
- ✅
- ✅
- ✅
* - StarCoder 2
- ✅
- ✅
@@ -67,14 +71,14 @@ Table below presents verified model support matrix for popular LLM architectures

Example
^^^^^^^
The example below shows how to quantize the Llama2 70b model into FP8 precision, using tensor parallelism of 8 on a single DGX H100 node. The quantized model is designed for serving using 2 GPUs specified with the ``export.inference_tensor_parallel`` parameter.
The example below shows how to quantize the Llama3 70b model into FP8 precision, using tensor parallelism of 8 on a single DGX H100 node. The quantized model is designed for serving using 2 GPUs specified with the ``export.inference_tensor_parallel`` parameter.

The script must be launched correctly with the number of processes equal to tensor parallelism. This is achieved with the ``torchrun`` command below:

.. code-block:: bash
torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_gpt_quantization.py \
model.restore_from_path=llama2-70b-base-bf16.nemo \
torchrun --nproc-per-node 8 examples/nlp/language_modeling/megatron_gpt_ptq.py \
model.restore_from_path=llama3-70b-base-bf16.nemo \
model.tensor_model_parallel_size=8 \
model.pipeline_model_parallel_size=1 \
trainer.num_nodes=1 \
@@ -83,15 +87,15 @@ The script must be launched correctly with the number of processes equal to tens
quantization.algorithm=fp8 \
export.decoder_type=llama \
export.inference_tensor_parallel=2 \
export.save_path=llama2-70b-base-fp8-qnemo
export.save_path=llama3-70b-base-fp8-qnemo

For large models, the command can be used in a multi-node setting. For example, this can be done with `NeMo Framework Launcher <https://github.com/NVIDIA/NeMo-Framework-Launcher>`_ using Slurm.

The output directory stores the following files:

.. code-block:: bash
llama2-70b-base-fp8-qnemo/
llama3-70b-base-fp8-qnemo/
├── config.json
├── rank0.safetensors
├── rank1.safetensors
@@ -108,7 +112,7 @@ The TensorRT-LLM engine can be conveniently built and run using ``TensorRTLLM``
trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder")
trt_llm_exporter.export(
nemo_checkpoint_path="llama2-70b-base-fp8-qnemo",
nemo_checkpoint_path="llama3-70b-base-fp8-qnemo",
model_type="llama",
)
trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
@@ -119,7 +123,7 @@ Alternatively, it can also be built directly using ``trtllm-build`` command, see
.. code-block:: bash
trtllm-build \
--checkpoint_dir llama2-70b-base-fp8-qnemo \
--checkpoint_dir llama3-70b-base-fp8-qnemo \
--output_dir /path/to/trt_llm_engine_folder \
--max_batch_size 8 \
--max_input_len 2048 \
@@ -129,19 +133,64 @@ Alternatively, it can also be built directly using ``trtllm-build`` command, see
Known issues
^^^^^^^^^^^^
* Currently in NeMo, quantizing and building TensorRT-LLM engines is limited to single-node use cases.
* The supported and tested model family is Llama2. Quantizing other model types is experimental and may not be fully supported.
* Currently, with the ``nemo.export`` module, building TensorRT-LLM engines for quantized "qnemo" models is limited to single-node deployments.


Please refer to the following papers for more details on quantization techniques.
Quantization-Aware Training (QAT)
---------------------------------

References
----------
QAT is the technique of fine-tuning a quantized model to recover model quality degradation due to quantization.
During QAT, the quantization scaling factors computed during PTQ are frozen and the model weights are fine-tuned.
While QAT requires much more compute resources than PTQ, it is highly effective in recovering model quality.
To perform QAT on a calibrated model from PTQ, you need to further fine-tune the model on a downstream task using a small dataset before exporting to TensorRT-LLM.
You can reuse your training pipeline for QAT.
As a rule of thumb, we recommend QAT for 1-10% of the original training duration, with a small learning rate, e.g. 1e-5 for the Adam optimizer.
If you are doing QAT on an SFT model where the learning rate and fine-tuning dataset size are already small, you can continue using the same SFT learning rate and dataset size as a starting point for QAT.
Since QAT is done after PTQ, the supported model families are the same as for PTQ.


Example
^^^^^^^

The example below shows how to perform PTQ and QAT on a supervised fine-tuned (SFT) Llama2 7B model, quantizing it to INT4 precision.
The script is tested using tensor parallelism of 8 on 8x RTX 6000 Ada 48GB GPUs. Alternatively, a single DGX A100 node with 8x 40GB GPUs can be used for the same purpose.
For bigger models like Llama2 70B, you may need to use one or more DGX H100 nodes with 8x 80GB GPUs each.

The example is a modified version of the `SFT with Llama 2 playbook <https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/llama2sft.html>`_.
Please refer to the playbook for more details on setting up a BF16 NeMo model and the ``databricks-dolly-15k`` instruction dataset.

`Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation, 2020 <https://arxiv.org/abs/2004.09602>`_
First, we will run the SFT example command from the playbook to train a Llama2 7B SFT model for 100 steps.
Make sure to change ``trainer.max_steps=50`` to ``trainer.max_steps=100`` for the ``examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py`` script.
This will take ~2 hours to produce a model checkpoint with a validation loss of approximately ``1.15`` that we will use for PTQ and QAT next.

`FP8 Formats for Deep Learning, 2022 <https://arxiv.org/abs/2209.05433>`_
For quantization, we use a modified version of the SFT script and config file that includes quantization and TensorRT-LLM export support.
Along with the new parameters, make sure to pass the same parameters you passed for SFT training, except that the model restore path is now the SFT output ``.nemo`` file.
The example command below performs PTQ on the SFT model checkpoint, followed by SFT again (QAT); the result can then be exported for TensorRT-LLM inference. The script will take ~2-3 hours to complete.

.. code-block:: bash

    torchrun --nproc-per-node 8 examples/nlp/language_modeling/tuning/megatron_gpt_qat.py \
        trainer.num_nodes=1 \
        trainer.devices=8 \
        trainer.precision=bf16 \
        trainer.max_steps=100 \
        model.restore_from_path=<llama2-7b-sft-nemo-path> \
        model.global_batch_size=128 \
        quantization.algorithm=int4 \
        # ... other parameters from SFT training

As you can see from the logs, the INT4 PTQ model has a validation loss of approximately ``1.31`` and the QAT model has a validation loss of approximately ``1.17``, which is very close to the BF16 model loss of ``1.15``.
This script will produce a quantized ``.nemo`` checkpoint in the experiment manager log directory (set in the config YAML file) that can be used for further training.
It can also optionally produce an exported TensorRT-LLM engine directory or a ``.qnemo`` file that can be used for inference by setting the ``export`` parameters similar to the PTQ example, as shown in the sketch below.
Note that you may tweak the QAT trainer steps and learning rate if needed to achieve better model quality.
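
For instance, once QAT completes, the resulting quantized checkpoint can be handled the same way as the PTQ output shown earlier. The sketch below reuses the ``TensorRTLLM`` exporter from the ``nemo.export`` example above; the import path and checkpoint name are assumptions for illustration:

.. code-block:: python

    # Illustrative only: build a TensorRT-LLM engine from the QAT-produced qnemo checkpoint.
    from nemo.export import TensorRTLLM

    trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder")
    trt_llm_exporter.export(
        nemo_checkpoint_path="llama2-7b-sft-int4-qnemo",  # hypothetical QAT export path
        model_type="llama",
    )
    print(trt_llm_exporter.forward(["What is quantization-aware training?"]))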


References
----------

`SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models, 2022 <https://arxiv.org/abs/2211.10438>`_
Please refer to the following papers for more details on quantization techniques:

`AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration, 2023 <https://arxiv.org/abs/2306.00978>`_
* `Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation, 2020 <https://arxiv.org/abs/2004.09602>`_
* `FP8 Formats for Deep Learning, 2022 <https://arxiv.org/abs/2209.05433>`_
* `SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models, 2022 <https://arxiv.org/abs/2211.10438>`_
* `AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration, 2023 <https://arxiv.org/abs/2306.00978>`_
6 changes: 3 additions & 3 deletions docs/source/starthere/intro.rst
@@ -96,13 +96,13 @@ This section details the steps to clone and install the Megatron Core.
git checkout a5415fcfacef2a37416259bd38b7c4b673583675 && \
pip install .
Model Optimizer Installation
TensorRT Model Optimizer Installation

This final step involves installing the Model Optimizer package.
This final step involves installing the TensorRT Model Optimizer package.

.. code-block:: bash
pip install nvidia-modelopt[torch]~=0.11.0 --extra-index-url https://pypi.nvidia.com
pip install nvidia-modelopt[torch]~=0.13.0 --extra-index-url https://pypi.nvidia.com
.. code-block:: bash