@@ -495,27 +495,20 @@ jobs:
   #   - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
   #     if: "failure()"

-  L2_PTQ_Llama2_Export_Only:
-    needs: [cicd-test-container-setup]
-    uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_Export_Only') || needs.cicd-test-container-setup.outputs.all == 'true'
-    with:
-      RUNNER: self-hosted-azure
-      SCRIPT: |
-        python examples/nlp/language_modeling/megatron_gpt_ptq.py \
-            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-            quantization.algorithm=null \
-            export.save_path=/tmp/nlp_megatron_llama_export_only/ci_baseline
-
   L2_PTQ_Llama2_FP8:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
     if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_FP8') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
+        CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
+            --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
+            --output_path=/tmp/nlp_megatron_llama/llama_ci.nemo \
+            --precision=16
+
         python examples/nlp/language_modeling/megatron_gpt_ptq.py \
-            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
+            model.restore_from_path=/tmp/nlp_megatron_llama/llama_ci.nemo \
             model.tensor_model_parallel_size=2 \
             trainer.devices=2 \
             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
@@ -526,55 +519,8 @@ jobs:
         export.sample_output=False \
         export.save_path=/tmp/nlp_megatron_llama_eo/ci_fp8.qnemo

-  L2_PTQ_Llama2_INT8_SQ:
-    needs: [cicd-test-container-setup]
-    uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_PTQ_Llama2_INT8_SQ') || needs.cicd-test-container-setup.outputs.all == 'true'
-    with:
-      RUNNER: self-hosted-azure
-      TIMEOUT: 15
-      SCRIPT: |
-        python examples/nlp/language_modeling/megatron_gpt_ptq.py \
-            model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-            quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
-            quantization.algorithm=int8_sq \
-            quantization.num_calib_size=8 \
-            inference.batch_size=2 \
-            export.sample_output=False \
-            export.save_path=/tmp/nlp_megatron_llama_eo/ci_int8_sq.qnemo
-
-  # TODO: investigate int4_awq stuck issues and restore the test
-  # L2_PTQ_Llama2_INT4_AWQ:
-  #   needs: [cicd-test-container-setup]
-  #   runs-on: self-hosted-azure
-  #   timeout-minutes: 10
-  #   container:
-  #     image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
-  #     options:
-  #       # --user 0:128
-  #       --device=/dev/nvidia0
-  #       --gpus all
-  #       --shm-size=8g
-  #       --env TRANSFORMERS_OFFLINE=0
-  #       --env HYDRA_FULL_ERROR=1
-  #       --volume /mnt/datadrive/TestData:/home/TestData
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #     - run: |
-  #         python examples/nlp/language_modeling/megatron_gpt_ptq.py \
-  #             model.restore_from_path=/home/TestData/nlp/megatron_llama/llama_ci.nemo \
-  #             model.tensor_model_parallel_size=1 \
-  #             trainer.devices=1 \
-  #             quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
-  #             quantization.algorithm=int4_awq \
-  #             quantization.num_calib_size=8 \
-  #             inference.batch_size=2 \
-  #             export.save_path=/home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
-  #
-  #         rm -rf /home/TestData/nlp/megatron_llama/ci_int4_awq.qnemo
-  #     - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
-  #       if: "failure()"
+      AFTER_SCRIPT: |
+        rm -rf /tmp/nlp_megatron_llama_eo/ci_fp8.qnemo

   # OPTIONAL_L2_QAT_Llama2_INT4:
   #   needs: [cicd-test-container-setup]
@@ -4477,10 +4423,8 @@ jobs:
       - L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
       - L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
       - L2_NeMo_2_Mixtral_Pretraining
-      - L2_PTQ_Llama2_INT8_SQ
       - L2_PTQ_Llama2_FP8
       - L2_Community_LLM_Checkpoints_tests_Llama3
-      - L2_PTQ_Llama2_Export_Only
       - L2_Distill_Llama2
       - L2_Prune_Width_Llama2
       - L2_Speech_to_Text_AED
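
For reference, a minimal sketch of what the reworked L2_PTQ_Llama2_FP8 job executes if run by hand. Assumptions: a machine with two GPUs and the same /home/TestData volume the self-hosted-azure runners mount; the quantization/export flags that fall between the two hunks above are not visible in this diff and are therefore omitted here.

    # Step 1 (new in this commit): convert the tiny HF Llama checkpoint to
    # .nemo on a single GPU, writing the result under /tmp.
    CUDA_VISIBLE_DEVICES=0 python scripts/checkpoint_converters/convert_llama_hf_to_nemo.py \
        --input_name_or_path=/home/TestData/nlp/megatron_llama/llama-ci-hf-tiny \
        --output_path=/tmp/nlp_megatron_llama/llama_ci.nemo \
        --precision=16

    # Step 2: post-training quantization against the freshly converted
    # checkpoint in /tmp rather than the pre-baked one in /home/TestData.
    # (Flags elided between the diff hunks are omitted.)
    python examples/nlp/language_modeling/megatron_gpt_ptq.py \
        model.restore_from_path=/tmp/nlp_megatron_llama/llama_ci.nemo \
        model.tensor_model_parallel_size=2 \
        trainer.devices=2 \
        quantization.calib_dataset=/home/TestData/nlp/test_quantization/test.json \
        export.sample_output=False \
        export.save_path=/tmp/nlp_megatron_llama_eo/ci_fp8.qnemo

    # Cleanup, mirroring the new AFTER_SCRIPT block:
    rm -rf /tmp/nlp_megatron_llama_eo/ci_fp8.qnemo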