GPT extrapolatable position embedding (xpos/sandwich/alibi/kerple) and Flash Attention #6666
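For readers skimming the commit list below, here is a minimal sketch of how the new options might be toggled from a model config. The key names (`position_embedding_type`, `use_flash_attention`) and the value strings are assumptions based on the PR title, not copied from the diff.

```python
# Hypothetical sketch (not taken from this PR's diff): enabling one of the new
# position embeddings together with Flash Attention in a Megatron GPT config.
# Key names and value strings are assumptions based on the PR title.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "model": {
            # assumed values: "alibi", "kerple", "sandwich", or "xpos"
            "position_embedding_type": "alibi",
            # assumed switch for the Flash Attention path added in this PR
            "use_flash_attention": True,
        }
    }
)
print(OmegaConf.to_yaml(cfg))
```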
Merged
Changes from 85 commits
Commits
a8564d3
move to nvidia megatron repo (#6465) (#6475)
github-actions[bot] 7a17f73
Megatron KERPLE positional embeddings (#6478) (#6480)
github-actions[bot] a67b00f
Fix an invalid link in get_data.py of ljspeech (#6456)
pythinker 1e1fbbe
1. Added external index sample. (#6462) (#6483)
github-actions[bot] 4561e12
Update README to add core installation (#6488) (#6489)
github-actions[bot] 599f522
Fix cache aware hybrid bugs (#6466) (#6484)
github-actions[bot] ae4a4dd
Fix typos (#6494) (#6495)
github-actions[bot] df2b870
Add disclaimer about dataset for ASR (#6496)
titu1994 0c85e21
fix (#6502)
Jorjeous 24c77d0
fix broken links r1.18.0 (#6501) (#6504)
github-actions[bot] 07f6533
[TTS] Create functions for TTS preprocessing without dataloader (#6317)
rlangman 8bffc80
Cache aware streaming nfa (#6209)
Slyne 6b84a8a
[BugFix] Force _get_batch_preds() to keep logits in decoder timestamp…
tango4j 56ce2a6
[TTS] Fix FastPitch energy code (#6511)
rlangman b460716
fix custom forward_torch_softmax (#6512) (#6517)
github-actions[bot] 319b191
[TTS] fixed broken path. (#6514) (#6518)
github-actions[bot] 2dd91fa
Fix normalization of impulse response in ImpulsePerturbation (#6505)
anteju d0e2f5a
Add interleaved pp support (#6498)
titu1994 3cff6ce
Fix typos (#6523)
titu1994 c2a4264
New noise_norm perturbation based on Riva work (#6445)
trias702 669a8c2
[TTS] Add script for computing feature stats (#6508)
rlangman 798978d
Add Frame-VAD model and datasets (#6441)
stevehuang52 cb53ede
Support dynamic length batches with GPT SFT (#6510)
aklife97 1217668
added back the fast emit section to the configs. (#6540) (#6542)
github-actions[bot] 5090a94
removing unnessary avoid_bfloat16_autocast_context (#6481)
bmwshop b2f23bd
FC models in menu (#6473)
bmwshop 6c77583
[TTS] Add tutorials for FastPitch TTS speaker adaptation with adapter…
hsiehjackson ce84b1f
[TTS] Create initial TTS dataset feature processors (#6507)
rlangman 8bbc140
fix (#6529) (#6546)
github-actions[bot] dc0c332
Add FastConformer Hybrid ASR models for EN, ES, IT, DE, PL, HR, UA, B…
github-actions[bot] 42691c3
Add scores for FastConformer models (#6557) (#6558)
github-actions[bot] e7f2210
Fix fp16 (#6543) (#6544)
github-actions[bot] 69b2c34
Patch transcribe and support offline transcribe for hybrid model (#65…
github-actions[bot] 24076ca
Fix notebook bad json (#6561)
titu1994 b41a511
Change Megatron Enc Dec model to use persistent_workers (#6548) (#6552)
github-actions[bot] 77369ef
Make KenLM with PC for AggregateTokenizer and merge it (#6081)
karpnv fa62794
fix for running on 1 GPU.
khcs 3817d41
temp rtd fix (#6568) (#6569)
github-actions[bot] a57ec70
[TTS] Add script for mapping speaker names to indices (#6509)
rlangman 5fd9c7f
whitespace (#6574)
karpnv 04c1b72
Update manifest.py for speedup (#6565) (#6573)
github-actions[bot] c13ffb9
More streaming conformer export fixes (#6567) (#6578)
github-actions[bot] 846fc83
user selected max_seq_len should be less than model's max_seq_len (#6…
github-actions[bot] c19aac5
Framework for PEFT via mixins (#6391)
arendu fba50b8
cache and reuse inputs (#6422) (#6452)
github-actions[bot] d0785d5
Add patches for Virtual Parallel conversion (#6589)
titu1994 c7f58d8
Pass `.scale` instead of scaler object to core (#6551)
github-actions[bot] 58440fb
Documentation for ASR-TTS models (#6594) (#6595)
github-actions[bot] aa2b9b8
[TTS] Fix aligner nan loss in fp32 (#6435)
hsiehjackson cf60b6c
Update SDP docs (#6485) (#6596)
github-actions[bot] 3c1147f
Bug/typo fixes (#6599)
Kipok 08ab1a7
Manual garbage collection with an interval (#6469) (#6482)
github-actions[bot] 3ed0282
Make tensor split contiguous (#6580) (#6593)
github-actions[bot] a9d2910
[ASR] Fix for old models in change_attention_model (#6608)
sam1373 077b7f9
Update manifest.py to use os.path for get_full_path (#6598)
stevehuang52 9eed6d3
Cherry pick commits in #6601 to main (#6611)
fayejf 77b9a85
Create dummy iters to satisy len checks (#6600) (#6603)
github-actions[bot] 9f367f4
add GPT eval mode fix for interleaved to main (#6610)
aklife97 8592562
Fix batch size reconf for T5 FT for multi-validation (#6582) (#6588)
github-actions[bot] b3f5f39
Not doing CastToFloat by default (#6524) (#6563)
github-actions[bot] 09f2e37
Turn autocast off when precision is fp32 (#6576)
github-actions[bot] 2a446cb
update core commit hash in readme (#6622) (#6623)
github-actions[bot] 2cc0f62
add hat image to docs (#6619) (#6621)
github-actions[bot] 94e6e25
Allow indices exchange via distributed (#6618) (#6624)
github-actions[bot] 7f48130
Offline and streaming inference support for hybrid model (#6570)
fayejf c44e3b6
Patch decoding for PC models (#6630) (#6631)
github-actions[bot] ef49b0a
Fix wer.py where 'errors' variable was not set (#6633) (#6634)
github-actions[bot] 1b785e2
Restore GPT support for interleaved pipeline parallelism (#6528) (#6613)
timmoon10 44e890e
Add FA
hsiehjackson a5fcbee
Fix XPOS
hsiehjackson aedcc7c
Add warning
hsiehjackson 7fbf571
Fix bugs
hsiehjackson ddb067e
Fix attention
hsiehjackson 81a8c21
Fix comment
hsiehjackson 36d685b
Fix cast dtype
hsiehjackson a1d1e5a
Undo xpos
hsiehjackson 2eaa60a
bugfix (#6636)
fayejf 5eb3552
Disable interctc tests (#6638)
Kipok 4e94268
Add megatron_core to requirements (#6639) (#6640)
github-actions[bot] 56847f3
Remove from jenkins (#6642)
github-actions[bot] 986feed
sft model can use this script for eval (#6637)
arendu 6d2c969
[TTS] Fix TTS audio preprocessing bugs (#6628)
rlangman 954d43f
Move black parameters to pyproject.toml (#6647)
artbataev 11c58f3
ASR-TTS Models: Support hybrid RNNT-CTC, improve docs. (#6620)
artbataev db7d578
fix conversion and eval (#6648)
arendu acb2c56
Confidence ensembles implementation (#6614)
Kipok 1b28a7b
Patch memory used for NeMo Megatron models (#6615)
titu1994 6fb6e47
handle artifacts when path is dir (#6658)
arendu 4ccba61
remove upgrading setuptools in reinstall.sh (#6659)
XuesongYang 82d5d58
merge lora weights into base model (#6597)
arendu 89b428c
upgrade to 23.04 (#6660)
ericharper 9683d02
Merge r1.18.0 bugfixes and doc updates to main (#6655)
ericharper c648d99
Confidence ensembles: fix issues and add tuning functionality (#6657)
Kipok f736f60
[TTS] Implement new TextToSpeech dataset (#6575)
rlangman 4e7afbb
Dialogue dataset (#6654)
yidong72 7e62925
Add support for RNNT/hybrid models to partial transcribe (#6609)
stevehuang52 e009385
eval_beamsearch_ngram.py with hybrid ctc (#6656)
karpnv c5e229a
fix bucketing bug issue for picking new bucket (#6663)
nithinraok 9d7d0b1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] b739a5e
Add t5 flash-attention
hsiehjackson 473ff20
PE refactor (#6673)
hsiehjackson 4a0699d
Add singleton alibi
hsiehjackson 9cfea92
Fix FA mask
hsiehjackson 8c3bfbd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 9d01255
singleton PE
hsiehjackson 8a6e294
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8bd1466
Fix attn bias inference
hsiehjackson 0e02478
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8ed6a0a
fix eval
ekmb a6b856c
[TTS] Add callback for saving audio during FastPitch training (#6665)
rlangman 213b5a3
update batch size recommendation to min 32 for 43b (#6675)
Zhilin123 1b93141
Make Note usage consistent in adapter_mixins.py (#6678)
BrianMcBrayer d2938b9
Fix masking bug for TTS Aligner (#6677)
redoctopus 1564d94
[ASR] Adding ssl config for fast-conformer (#6672)
krishnacpuvvada 82f863b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8b55842
Fix xpos offset
hsiehjackson fbdd7fe
Fix sequence parallel
hsiehjackson 8535a6a
Fix parallel
hsiehjackson 873f2e1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 7847a54
Uncomment correct bias size
hsiehjackson 4aa46d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 7514bf4
Remove unused module
hsiehjackson 9a133d0
Fix singleton tril
hsiehjackson 5ce3819
Fix kerple/sandwitch rename xpos
hsiehjackson bbee276
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] de61214
fix sandwich
hsiehjackson dcab11e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] cd3bb6d
Add unitest
hsiehjackson 4fac042
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 129e55d
Fix bug
hsiehjackson 3b5ec97
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] b2eb222
Add requirements
hsiehjackson c73f983
Remove requirements
hsiehjackson 06ce313
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8c969fe
Remove requirement flash-attn
hsiehjackson f70cc3f
Fix FA causal for inference
hsiehjackson a0cea83
Add experimental PE
hsiehjackson c7c6a1b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 6876703
Update all invalid tree references to blobs for NeMo samples (#6679)
BrianMcBrayer 6c65625
Update README.rst about container (#6686)
fayejf 456153a
Fix a bug, use _ceil_to_nearest instead as _round_to_nearest is not d…
github-actions[bot] 69992d6
Enable ONNX export of 5B GPT trained with TE FP8 modules (#6458)
asfiyab-nvidia 4ee6d8f
[TTS] Add script for text preprocessing (#6541)
rlangman c856936
[TTS] Fix adapter duration issue (#6697)
hsiehjackson b70dbf7
karpnv/issues6690 (#6705)
karpnv 1a66d30
Limit codeql scope (#6710)
titu1994 ff772f7
eval fix (#6685)
arendu 2231a57
Fix k2 installation in Docker with CUDA 12 (#6707) (#6709)
github-actions[bot] 8b3dce5
[TTS] Filter out silent audio files during preprocessing (#6716)
rlangman 963855b
not pinning version (#6680)
yidong72 b0f33f1
Tutorial fixes (#6717) (#6718)
github-actions[bot] a4ef711
preprocess squad in sft format (#6727)
arendu da5e6f8
Fix Codeql (#6731)
titu1994 2c35e0b
[TTS] fix inconsistent type hints for IpaG2p (#6733)
XuesongYang 2bac13d
VP Fixes for converter + Config management (#6698)
titu1994 5831405
Graph RNNT: Grid- and Compose-Transducer. W-Transducer loss (#6168)
artbataev 2e963da
Fix fastpitch test nightly (#6730)
hsiehjackson 7f83283
Fix for interctc test random failure (#6644)
Kipok 599c503
check for first or last stage (#6708) (#6743)
github-actions[bot] 0725b2d
sharded manifests docs (#6751)
bmwshop bdeab5b
[TTS] relax hardcoded prefix for phonemes and tones and infer phoneme…
XuesongYang 146371b
[TTS] corrected misleading deprecation warnings. (#6702)
XuesongYang 8f43ae3
Bug fix to restore act ckpt (#6753) (#6755)
github-actions[bot] 7daad62
Bug fix to reset sequence parallelism (#6756) (#6770)
github-actions[bot] 49e016e
Fix TTS adapter tutorial (#6741)
hsiehjackson 34f5452
Fix checkpointed forward and add test for full activation checkpointi…
github-actions[bot] c022acb
lora notebook (#6765)
arendu e98f425
Fix Links (#6777) (#6778)
github-actions[bot] bcb3fd3
Remove alibi tril
hsiehjackson 71bff2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 2e6eba5
Add flash-attn requirement
hsiehjackson 424a15d
revert sft dataset changes
ekmb e79a35a
Move flash-attn requirement
hsiehjackson 4c953aa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 0b18768
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8863360
Add install
hsiehjackson 2dc0418
peft eval directly from ckpt (#6785)
arendu 1353aca
Add Frame-VAD examples and utils (#6463)
stevehuang52 b8d19b2
[TTS][zh] refine hardcoded lowercase for ASCII letters. (#6781)
XuesongYang 7ad325d
Revert evaluation
hsiehjackson b875a78
Revert evaluation
hsiehjackson 1f229c0
Fix
hsiehjackson 26dbc9f
Fix gpu
hsiehjackson a3cf08e
Spellchecking ASR customization model (#6179)
bene-ges 90ef33a
Fix test
hsiehjackson 380a6f2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] b69cbf7
Fix device
hsiehjackson de52c2d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 8dc863b
Fix conflict
hsiehjackson 29c3cd4
Merge branch 'main' into gpt-alibi-FA
hsiehjackson e782202
Revert
hsiehjackson 7f40a05
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson d814f47
clean
hsiehjackson 65118c4
Change device
hsiehjackson 89d4547
Change device
hsiehjackson 9c50e29
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 218ffa3
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 84acce0
Add test FA
hsiehjackson 874f992
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 35ac850
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 98783ce
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 08dbd86
Add CI
hsiehjackson bdfe61e
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 6df2df8
Fix yaml order
hsiehjackson 1f460d9
Test random attention mask
hsiehjackson 01f4391
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 23634bf
Add install FA for tests
hsiehjackson 4cfb2da
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 528c416
cherry pick 6788 (#6816)
ekmb a751928
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson ee692d4
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 5178f6b
Support 2D mask
hsiehjackson 45876ad
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 1c15644
add missing comp_att_mask arg
ekmb 74da509
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
ekmb 5da1bc3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] fb895da
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 81d2fb0
Fix code ql
hsiehjackson b578ff5
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 82120c3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] a9bb73e
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 662733b
Megatron MPT-7B Support (#6804)
trias702 6b18be2
Fix test triton
hsiehjackson bdd91d6
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 92e7dba
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] bb89e61
Update FA in CI
hsiehjackson 672f262
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 2a526ad
Fix Jenkin error
hsiehjackson 0ac5374
Resume with FA
hsiehjackson 7acf5cf
Follow comments
hsiehjackson cdff779
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] dcab29d
Merge branch 'main' into gpt-alibi-FA
hsiehjackson aba44ae
Fix README
hsiehjackson 7c0a530
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 194b4bb
Fix README
hsiehjackson fe173c1
Remove torch.cuda
hsiehjackson 7c38447
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 1104174
Remove unused import
hsiehjackson a3010bd
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 81b002e
Merge branch 'main' into gpt-alibi-FA
hsiehjackson a883aa2
kerple init
hsiehjackson 6a895f0
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 0504814
Merge branch 'main' into gpt-alibi-FA
hsiehjackson 889dec6
Add TE comment
hsiehjackson fd2899a
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 7255e31
Merge branch 'main' into gpt-alibi-FA
hsiehjackson c972553
Merge branch 'main' into gpt-alibi-FA
hsiehjackson b8b5611
Fix error when inference.compute_attention_mask=False
hsiehjackson 83ef08d
Merge branch 'gpt-alibi-FA' of https://github.com/NVIDIA/NeMo into gp…
hsiehjackson 498ec3d
Merge branch 'main' into gpt-alibi-FA
michalivne
Does this need to be pinned?
FA currently pins the triton version. I have tried other versions, and they raise errors.
https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
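For reference, a rough sketch of how that pin could be surfaced at runtime; the version string below is a placeholder assumption, not the actual pin from flash_attn_triton.py.

```python
# Rough sketch: warn when the installed triton does not match the version that
# flash-attn's Triton kernel is pinned to. REQUIRED_TRITON is a placeholder
# assumption; see flash_attn_triton.py for the real pin.
import importlib.metadata

REQUIRED_TRITON = "2.0.0.dev20221202"  # placeholder, not the actual pinned version

try:
    installed = importlib.metadata.version("triton")
except importlib.metadata.PackageNotFoundError:
    installed = None

if installed != REQUIRED_TRITON:
    print(
        f"triton=={installed} is installed, but the flash-attn Triton kernel "
        f"expects triton=={REQUIRED_TRITON}; other versions may raise errors."
    )
```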
@ericharper are you OK with moving forward with this setup?