Commit

Revert "Revert Mcore update since it caused regression (#11791)" (#11799)

* Revert "Revert Mcore update since it caused regression (#11791)"

This reverts commit 84b2bf0.

* Fix Gemma2 Attention init args (#11792)

* Use _get_mlp_module_spec from Megatron Core rather than redefine locally (#11834)

* Use _get_mlp_module_spec from MCore rather than redefine

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

* Update nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py

Co-authored-by: oliver könig <[email protected]>
Signed-off-by: Jan Lasek <[email protected]>

---------

Signed-off-by: Jan Lasek <[email protected]>
Signed-off-by: janekl <[email protected]>
Co-authored-by: janekl <[email protected]>
Co-authored-by: oliver könig <[email protected]>

* Bugfix for output_generation_logits in tensorrtllm (#11820) (#11833)

Signed-off-by: Abhishree <[email protected]>
Signed-off-by: Jan Lasek <[email protected]>
Co-authored-by: Abhishree Thittenamane <[email protected]>

---------

Signed-off-by: Jan Lasek <[email protected]>
Signed-off-by: janekl <[email protected]>
Signed-off-by: Abhishree <[email protected]>
Co-authored-by: Ao Tang <[email protected]>
Co-authored-by: Jan Lasek <[email protected]>
Co-authored-by: janekl <[email protected]>
Co-authored-by: Abhishree Thittenamane <[email protected]>
5 people authored Jan 16, 2025
1 parent 2db5dbb commit fe2ae82
Showing 28 changed files with 97 additions and 146 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/cicd-main.yml
@@ -2938,7 +2938,7 @@ jobs:
with:
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
@@ -2966,6 +2966,7 @@ jobs:
+model.tp_comm_overlap_ag=False \
+model.tp_comm_overlap_rs=False \
+model.tp_comm_overlap_disable_qkv=True \
+model.attention_backend="unfused" \
model.peft.peft_scheme="lora" \
model.peft.lora_tuning.adapter_dim=16 \
model.peft.lora_tuning.alpha=32 \
@@ -4368,7 +4369,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \
python3 tests/collections/llm/megatron_mixtral_pretraining.py \
--experiment-dir=/tmp/mixtral_pretrain_results \
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document
79 changes: 29 additions & 50 deletions .github/workflows/import-test.yml
@@ -1,73 +1,52 @@
name: CI-Import-Check

on:
push:
pull_request:
paths:
- "**"

# Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags
jobs:

test-asr-imports:
runs-on: ubuntu-latest
container:
image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
test-imports:
name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }}
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
collection:
- asr
# - nlp # Currently broken
- tts
python: ['3.10', '3.11', '3.12']
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Update base dependencies
run: |
apt-get update && apt-get install -y build-essential
apt-get install -y libsndfile1 make
- name: Install nemo dependencies
- uses: actions/setup-python@v5
with:
python-version: '${{ matrix.python }}'
- name: Build wheel
id: nemo-wheel
run: |
pip install Cython
# install test requirements
pip install -r requirements/requirements_test.txt
# Build nemo as a wheel
pip install build
python -m build --no-isolation --wheel
python -m build --wheel
# Preserve wheel location
DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
echo "::set-output name=DIST_FILE::${DIST_FILE}"
- name: Test ASR Domain Imports
run: |
# Install NeMo Domain
pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]"
# Run import checks
python tests/core_ptl/check_imports.py --domain "asr"
# Uninstall NeMo
pip uninstall -y nemo_toolkit
test-tts-imports:
runs-on: ubuntu-latest
container:
image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
steps:
- name: Checkout repo
uses: actions/checkout@v2
- name: Update base dependencies
echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT"
- name: Install NeMo + test dependencies
run: |
apt-get update && apt-get install -y build-essential
apt-get install -y libsndfile1 make
- name: Install nemo dependencies
id: nemo-wheel
run: |
pip install Cython
# install test requirements
pip install -r requirements/requirements_test.txt
# Build nemo as a wheel
pip install build
python -m build --no-isolation --wheel
# Preserve wheel location
DIST_FILE=$(find ./dist -name "*.whl" | head -n 1)
echo "::set-output name=DIST_FILE::${DIST_FILE}"
- name: Test TTS Domain Imports
run: |
# Install NeMo Domain
pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]"
pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]"
- name: Run ${{ matrix.collection }} checks
run: |
# Run import checks
python tests/core_ptl/check_imports.py --domain "tts"
# Uninstall NeMo
pip uninstall -y nemo_toolkit
python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}"

30 changes: 11 additions & 19 deletions Dockerfile.ci
@@ -34,17 +34,12 @@ EOF
WORKDIR /workspace

# Install Mamba Dependancy
ARG CAUSAL_CONV_TAG=v1.2.2.post1
ARG CAUSAL_CONV_TAG=v1.2.2.post1
ARG MAMBA_TAG=v2.2.0

RUN <<"EOF" bash -ex
# Mamba dependancy installation

git clone --depth 1 --branch ${CAUSAL_CONV_TAG} https://github.com/Dao-AILab/causal-conv1d && \
cd causal-conv1d && \
python setup.py install && \
cd .. && \
rm -rf causal-conv1d

MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG}
EOF

RUN pip install hatchling # needed to install nemo-run
@@ -54,8 +49,6 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.21.0
ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -65,23 +58,22 @@ RUN \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
"nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \
"apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \
"unstructured==0.14.9" \
"llama-index==0.10.43" \
"onnxscript @ git+https://github.com/microsoft/onnxscript" \
-r tools/ctc_segmentation/requirements.txt \
".[all]"
EOF

# Megatron Core installation
git clone https://github.com/NVIDIA/Megatron-LM.git && \
pushd Megatron-LM && \
git checkout ${MCORE_TAG} && \
pushd megatron/core/datasets && \
make && \
popd && \
popd
ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031
RUN <<"EOF" bash -ex
# Megatron-LM installation
git clone https://github.com/NVIDIA/Megatron-LM.git
pushd Megatron-LM
git checkout ${MCORE_TAG}
pip install -e .
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

# Install nvidia-resiliency-ext
3 changes: 1 addition & 2 deletions docs/source/nlp/information_retrieval.rst
@@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
VALIDATION_DATASET_PATH= # Path to validation dataset
SAVE_DIR= # where the checkpoint and logs are saved
mkdir -p $SAVE_DIR
export NVTE_FLASH_ATTN=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0
export NVTE_FUSED_ATTN=0
python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \
--config-path=${CONFIG_PATH} \
@@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script:
model.post_process=False \
model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size
model.micro_batch_size=8 \
model.attention_backend="unfused" \
model.optim.lr=0.000005 \
model.optim.sched.min_lr=0.00000001 \
model.optim.sched.warmup_steps=100 \
1 change: 0 additions & 1 deletion nemo/collections/diffusion/scripts/train.sh
@@ -20,7 +20,6 @@
export WANDB_PROJECT=xxx
export WANDB_RUN_ID=xxx
export WANDB_RESUME=allow
export NVTE_FUSED_ATTN=0
export CUDA_DEVICE_MAX_CONNECTIONS=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

3 changes: 3 additions & 0 deletions nemo/collections/llm/gpt/model/gemma.py
@@ -18,6 +18,7 @@

import torch
from megatron.core import parallel_state
from megatron.core.transformer.enums import AttnBackend
from torch import nn

from nemo.collections.llm.fn.activation import openai_gelu
@@ -53,6 +54,8 @@ class GemmaConfig(GPTConfig):
# Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script
# The present implementation is more in line with the official implementation
layernorm_zero_centered_gamma: bool = True
# Disable cuDNN attention since TE 1.8 does not support head dim > 128
attention_backend: AttnBackend = AttnBackend.flash


@dataclass
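The flash default above replaces the NVTE_FUSED_ATTN=0 workaround that the Gemma recipes below stop exporting. As a minimal sketch (assuming the Megatron Core tag pinned by this commit, which provides the AttnBackend enum imported above), the backend can now be overridden per config rather than via environment variables; the subclass below is hypothetical:

    # Sketch: pick the attention backend in the config instead of exporting
    # NVTE_FUSED_ATTN / NVTE_FLASH_ATTN at runtime.
    from dataclasses import dataclass

    from megatron.core.transformer.enums import AttnBackend

    from nemo.collections.llm.gpt.model.gemma import GemmaConfig


    @dataclass
    class UnfusedAttnGemmaConfig(GemmaConfig):  # hypothetical subclass for illustration
        # Override the flash default from GemmaConfig, e.g. for debugging kernels.
        attention_backend: AttnBackend = AttnBackend.unfused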
2 changes: 0 additions & 2 deletions nemo/collections/llm/recipes/gemma_2b.py
@@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]:
>>> model_config = model()
>>> print(model_config)
"""
# Disable cuDNN attention since TE 1.8 does not support head dim > 128
os.environ['NVTE_FUSED_ATTN'] = "0"
return run.Config(GemmaModel, config=run.Config(GemmaConfig2B))


4 changes: 0 additions & 4 deletions nemo/collections/llm/recipes/gemma_7b.py
@@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]:
>>> model_config = model()
>>> print(model_config)
"""
# Disable cuDNN attention since TE 1.8 does not support head dim > 128
os.environ['NVTE_FUSED_ATTN'] = "0"
return run.Config(GemmaModel, config=run.Config(GemmaConfig7B))


@@ -173,8 +171,6 @@ def pretrain_recipe(
For more details on pre-training LLMs with NeMo, see the pre-training
guide in the `examples/llm/pretrain/` directory.
"""
# Disable cuDNN attention since TE 1.8 does not support head dim > 128
os.environ['NVTE_FUSED_ATTN'] = "0"

return run.Partial(
fn,
nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
@@ -17,13 +17,11 @@
try:
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules
from megatron.core.transformer.moe.shared_experts import SharedExpertMLP
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules

@@ -57,7 +55,8 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec
if not HAVE_MEGATRON_CORE:
raise IMPORT_ERROR

mlp = _get_mlp_module_spec(num_experts=num_experts)
mlp = _get_mlp_module_spec(use_te=False, num_experts=num_experts, moe_grouped_gemm=False)

return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
@@ -84,35 +83,3 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec
},
),
)


# Helper function to get module spec for MLP/MoE
def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec:
if num_experts is None:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear,
linear_fc2=RowParallelLinear,
),
)
else:
# Mixture of experts with modules in megatron core.
return ModuleSpec(
module=MoELayer,
submodules=MoESubmodules(
experts=MLPSubmodules(
linear_fc1=ColumnParallelLinear,
linear_fc2=RowParallelLinear,
),
shared_experts=ModuleSpec(
module=SharedExpertMLP,
params={"gate": False},
submodules=MLPSubmodules(
linear_fc1=ColumnParallelLinear,
linear_fc2=RowParallelLinear,
),
),
),
)
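The local _get_mlp_module_spec helper deleted above is superseded by the Megatron Core implementation imported at the top of the file. A minimal sketch of the delegation, using the argument names shown in the diff; the example expert count is illustrative:

    # Sketch: build the MLP/MoE module specs via MCore instead of a local copy.
    from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec

    # Dense MLP spec (no experts), matching the non-TE modules used by the ModelOpt spec.
    dense_mlp_spec = _get_mlp_module_spec(use_te=False, num_experts=None, moe_grouped_gemm=False)

    # MoE spec, e.g. for an 8-expert model.
    moe_mlp_spec = _get_mlp_module_spec(use_te=False, num_experts=8, moe_grouped_gemm=False)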
nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -50,6 +50,7 @@
try:
from megatron.core import ModelParallelConfig, parallel_state
from megatron.core.distributed import DistributedDataParallel as McoreDDP
from megatron.core.transformer.enums import AttnBackend
from megatron.core.transformer.module import Float16Module as MCoreFloat16Module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import init_method_normal, scaled_init_method_normal
@@ -538,6 +539,9 @@ def build_transformer_config(self) -> TransformerConfig:

tp_only_amax_red = self.cfg.get('tp_only_amax_red', False)

attention_backend = self.cfg.get('attention_backend', "auto")
attention_backend = AttnBackend[attention_backend]

# any configs that are not in the nemo model config will be added here
config_mapping = {
'apply_query_key_layer_scaling': apply_query_key_layer_scaling,
@@ -562,6 +566,7 @@ def build_transformer_config(self) -> TransformerConfig:
'rotary_interleaved': rotary_interleaved,
'deallocate_pipeline_outputs': True,
'tp_only_amax_red': tp_only_amax_red,
'attention_backend': attention_backend,
}

# populate the transformer config dict
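With this mapping, the attention backend is chosen through the model config rather than NVTE_* environment variables. A minimal sketch of the string-to-enum lookup added above; the cfg dict stands in for the NeMo model config:

    # Sketch: model.attention_backend="unfused" (for example) becomes an MCore
    # AttnBackend enum value and is forwarded to TransformerConfig via config_mapping.
    from megatron.core.transformer.enums import AttnBackend

    cfg = {"attention_backend": "unfused"}  # illustrative stand-in for self.cfg

    backend = AttnBackend[cfg.get("attention_backend", "auto")]
    print(backend)  # AttnBackend.unfused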
@@ -76,6 +76,7 @@
from megatron.core.models.retro.utils import get_config_path as get_retro_config_path
from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir
from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
from megatron.core.transformer.enums import AttnBackend
from megatron.core.transformer.module import Float16Module as MCoreFloat16Module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.utils import init_method_normal, scaled_init_method_normal
@@ -431,6 +432,8 @@ def build_retro_config(self) -> RetroConfig:

te_version = packaging.version.Version(version("transformer-engine"))
if te_version >= packaging.version.Version("1.3"):
if HAVE_MEGATRON_CORE:
retro_config.attention_backend = AttnBackend.unfused
try:
os.environ["NVTE_FLASH_ATTN"] = "0"
os.environ["NVTE_FUSED_ATTN"] = "0"
4 changes: 2 additions & 2 deletions nemo/collections/vlm/mllama/model/language.py
@@ -390,7 +390,7 @@ def sharded_state_dict(
layer_prefix = f'{prefix}layers.'
num_layers = self.config.num_layers
for layer in self.layers:
offset = layer._get_layer_offset()
offset = layer._get_layer_offset(layer.config)
global_layer_offset = layer.layer_number - 1 # self.layer_number starts at 1
state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long
sharded_prefix = layer_prefix
@@ -403,7 +403,7 @@ def sharded_state_dict(
for xlayer in self.xattn_layers:
if isinstance(xlayer, DummyCrossAttentionTransformerLayer):
continue
offset = xlayer._get_layer_offset()
offset = xlayer._get_layer_offset(xlayer.config)
global_layer_offset = xlayer.layer_number - 1
state_dict_prefix = f'{xlayer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long
sharded_prefix = f'{xlayer_prefix}{global_layer_offset}.'
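The two edits in this file adapt to the newer Megatron Core, where _get_layer_offset takes the layer's config explicitly. A minimal sketch of the prefix computation with the updated signature, using the names from the diff:

    # Sketch: checkpoint prefix for one layer with the new _get_layer_offset(config)
    # signature (it previously took no arguments).
    def layer_state_dict_prefix(layer, layer_prefix: str) -> str:
        offset = layer._get_layer_offset(layer.config)  # pass the layer's config
        global_layer_offset = layer.layer_number - 1    # layer_number starts at 1
        # module-list index of this layer inside the local TransformerBlock
        return f"{layer_prefix}{global_layer_offset - offset}."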
2 changes: 1 addition & 1 deletion nemo/lightning/pytorch/callbacks/peft.py
@@ -480,7 +480,7 @@ def load_checkpoint(
if getattr(path, "base_model_path", None):
## PEFT Resume, FIRST TIME
self.adapter_ckpt_path = Path(str(path))
adapter_ckpt = self.checkpoint_io.load_checkpoint(path) # Loads only metadata
adapter_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict={}) # Loads only metadata
# path is adapter path to restore the training metadata, but switch to loading base model here.
path = self.model_ckpt_path = path.base_model_path
elif adapter_meta_path.exists():
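Passing an empty sharded_state_dict to load_checkpoint requests no tensor shards, so only the checkpoint's common metadata comes back, which is all the PEFT resume path needs before switching to the base model checkpoint. A minimal sketch under that assumption; the helper and its wiring are illustrative:

    # Sketch: metadata-only load of an adapter checkpoint, as in the PEFT resume
    # path above. Assumes checkpoint_io is a distributed-checkpoint-aware CheckpointIO.
    from pathlib import Path

    def load_adapter_metadata(checkpoint_io, adapter_path: Path) -> dict:
        # Requesting no shards returns only the checkpoint's common (metadata) state.
        return checkpoint_io.load_checkpoint(adapter_path, sharded_state_dict={})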