diff --git a/.github/workflows/cpu_ci.yml b/.github/workflows/cpu_ci.yml index f61978163..9160fccab 100644 --- a/.github/workflows/cpu_ci.yml +++ b/.github/workflows/cpu_ci.yml @@ -4,7 +4,8 @@ on: "push" jobs: run-tests: - runs-on: ubuntu-latest + #runs-on: ubuntu-latest + runs-on: [ 'test', 'self-hosted' ] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 8ee2f2a62..3213718df 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -4,12 +4,12 @@ on: [pull_request] jobs: pre-commit: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - uses: actions/setup-python@v4 with: - python-version: 3.8 + python-version: 3.10 cache: "pip" cache-dependency-path: "**/requirements*.txt" # Need the right version of clang-format @@ -24,7 +24,7 @@ jobs: uses: docker/build-push-action@v2 update-documentation: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 with: diff --git a/.gitignore b/.gitignore index 34de4e774..dbc83e949 100644 --- a/.gitignore +++ b/.gitignore @@ -137,6 +137,7 @@ data/**/*.bin data/**/*.json* data/**/*.txt data/**/*.gz +data/**/*.zip data/**/*.np* data/**/*.npy checkpoints/ @@ -150,3 +151,7 @@ test_logs/ logs/ tensorboard/ src/ + +# test data files +tests/data/*.bin +tests/data/*.idx diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e9ca3920d..0eaf9b13c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,10 +16,10 @@ repos: args: [--fix=lf] - id: requirements-txt-fixer - id: trailing-whitespace - - repo: https://gitlab.com/daverona/pre-commit-cpp + - repo: https://gitlab.com/daverona/pre-commit/cpp rev: 0.8.0 hooks: - - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available + - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available args: [] - repo: https://github.com/psf/black diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..ee633e8c5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,86 @@ +# Contributing +GPT-NeoX welcomes your contributions! + +## Prerequisites +GPT-NeoX uses [pre-commit](https://pre-commit.com/) to ensure that formatting is +consistent across GPT-NeoX. First, ensure that `pre-commit` is installed with +`pip install pre-commit`. Next, the pre-commit hooks must be installed once +before commits can be made: +```bash +pre-commit install +``` +Please install `clang-format` from Conda: +```bash +conda install clang-format +``` + +Afterwards, our suite of formatting tests run automatically before each `git commit`. You +can also run these manually: +```bash +pre-commit run --all-files +``` +If a formatting test fails, it will fix the modified code in place and abort +the `git commit`. After looking over the changes, you can `git add ` +and then repeat the previous `git commit` command. + + +## Testing +GPT-NeoX tracks two types of tests: unit tests and more costly model convergence tests. +Unit tests are found in `tests/unit/` and the model convergence tests are found in +`tests/model/`. + +### Unit Tests +[PyTest](https://docs.pytest.org/en/latest/) is used to execute tests. PyTest can be +installed from PyPI via `pip install pytest`. 
Simply invoke `pytest --forked` to run the +unit tests: +```bash +pytest --forked tests/unit/ +``` +You can also provide the `-v` flag to `pytest` to see additional information about the +tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the +`--forked` flag are required to test CUDA functionality in distributed tests. + +### Model Tests +To execute model tests, first install GPT-NeoX. Next, execute the model test driver: +```bash +cd tests/model/ +pytest run_sanity_check.py +``` +Note that the `--forked` flag is not necessary for the model tests. + +## Contributor License Agreement +This project welcomes contributions and suggestions. Most contributions require you to +agree to a Contributor License Agreement (CLA) declaring that you have the right to, and +actually do, grant us the rights to use your contribution. For details, visit +https://cla-assistant.io/EleutherAI/gpt-neox. + +When you submit a pull request, a CLA bot will automatically determine whether you need +to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply +follow the instructions provided by the bot. You will only need to do this once across +all repos using our CLA. + +## New Feature Contribution Guidelines +Unlike a bug fix or an improvement to an existing feature (where users usually submit a PR directly and we review it), adding a new feature to GPT-NeoX requires several steps: (1) proposal and discussion, (2) implementation and verification, (3) release and maintenance. This general guideline applies to all new feature contributions. Core GPT-NeoX team member contributions may complete step 1 internally. + +### Step 1: Proposal and Discussion +We ask users to first post their intended feature in an issue. This issue needs to include: + +* A description of the proposed feature. +* A motivation for why it will be useful to GPT-NeoX users. +* A rough design of how you plan to implement the feature inside GPT-NeoX. +* (Important) Results or planned experiments to demonstrate the effectiveness and correctness of the feature. + * If the feature only affects performance and does not affect training convergence, we require testing on a fraction of training to demonstrate that the training/validation losses are consistent with the baseline, and that the performance is better than the baseline. + * If the feature does affect training convergence, we require testing the whole training run to demonstrate that the feature achieves better or on-par final model quality and training performance compared to the baseline. + +Based on the issue, we will discuss the merit of the new feature and decide whether to accept or decline the proposal. Once the proposal is accepted and we have confirmed the design and implementation plan, we are ready for step 2. + +### Step 2: Implementation and Verification +The contributor will then implement the feature, and the GPT-NeoX team will provide guidance and help as needed. The required deliverables include: + +* A PR to [EleutherAI/GPT-NeoX](https://github.com/EleutherAI/gpt-neox) including (1) the feature implementation, (2) unit tests, (3) documentation, and (4) example usage. +* In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance. + +After receiving the PRs, we will review them and merge them after the necessary tests/fixes. + +### Step 3: Release and Maintenance +After the PRs are merged, we will announce the feature on our website (with credit to the feature author).
We ask the feature author to commit to the maintenance of the feature. diff --git a/Dockerfile b/Dockerfile index bd30c11c3..570c95f2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM nvidia/cuda:11.7.1-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.1-devel-ubuntu22.04 ENV DEBIAN_FRONTEND=noninteractive @@ -21,20 +21,20 @@ LABEL org.opencontainers.image.version = "2.0" LABEL org.opencontainers.image.authors = "contact@eleuther.ai" LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox" LABEL org.opencontainers.image.licenses = " Apache-2.0" -LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:11.7.1-devel-ubuntu20.04" +LABEL org.opencontainers.image.base.name="docker.io/nvidia/cuda:12.1.1-devel-ubuntu22.04" #### System package (uses default Python 3 version in Ubuntu 20.04) RUN apt-get update -y && \ apt-get install -y \ - git python3.9 python3-dev libpython3-dev python3-pip sudo pdsh \ - htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \ + git python3-dev libpython3-dev python3-pip sudo pdsh \ + htop tmux zstd software-properties-common build-essential autotools-dev \ nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \ rsync iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils ibverbs-utils \ rdmacm-utils perftest rdma-core nano && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - pip install --upgrade pip && \ - pip install gpustat + python -m pip install --upgrade pip && \ + python -m pip install gpustat ### SSH RUN mkdir /var/run/sshd && \ @@ -88,24 +88,31 @@ RUN mkdir -p /home/mchorse/.ssh /job && \ echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc #### Python packages -RUN pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 && pip cache purge -COPY requirements/requirements.txt . -COPY requirements/requirements-wandb.txt . -COPY requirements/requirements-onebitadam.txt . -COPY requirements/requirements-sparseattention.txt . -COPY requirements/requirements-flashattention.txt . 
-RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt -RUN pip install -r requirements-sparseattention.txt -RUN pip install -r requirements-flashattention.txt -RUN pip install -r requirements-wandb.txt -RUN pip install protobuf==3.20.* -RUN pip cache purge +RUN python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 +COPY requirements/* ./ +RUN python -m pip install --no-cache-dir -r requirements.txt && pip install -r requirements-onebitadam.txt +RUN python -m pip install -r requirements-sparseattention.txt +RUN python -m pip install -r requirements-flashattention.txt +RUN python -m pip install -r requirements-wandb.txt +RUN python -m pip install protobuf==3.20.* +RUN python -m pip cache purge ## Install APEX -RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597 +# Detect the architecture and install Apex accordingly +RUN ARCH=$(uname -m) && \ + if [ "$ARCH" = "x86_64" ]; then \ + wget https://github.com/segyges/not-nvidia-apex/releases/download/jan-2024/apex-0.1-cp310-cp310-linux_x86_64.zip && \ + unzip ./apex-0.1-cp310-cp310-linux_x86_64.zip && \ + python -m pip install ./apex-0.1-cp310-cp310-linux_x86_64.whl; \ + else \ + # Install Apex directly from source for other architectures + python -m pip install -r requirements-apex-pip.txt && \ + python -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings --global-option=--cpp_ext --config-settings --global-option=--cuda_ext git+https://github.com/NVIDIA/apex.git@141bbf1cf362d4ca4d94f4284393e91dda5105a5; \ + fi COPY megatron/fused_kernels/ megatron/fused_kernels -RUN python megatron/fused_kernels/setup.py install +WORKDIR /megatron/fused_kernels +RUN python setup.py install # Clear staging RUN mkdir -p /tmp && chmod 0777 /tmp diff --git a/LICENSE b/LICENSE index 99cf99888..b7224a614 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,5 @@ Apache License - Version 2.0, January 2004 + Version 2.0, January 2024 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION @@ -211,7 +211,7 @@ used in those files, as indicated. ------------- LICENSE FOR NVIDIA code -------------- -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -241,7 +241,7 @@ used in those files, as indicated. 
Apache License - Version 2.0, January 2004 + Version 2.0, January 2024 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION diff --git a/README.md b/README.md index f82bedec0..721306c22 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherA * [Weights and Biases](#weights-and-biases) * [TensorBoard](#tensorboard) - [Running on multi-node](#running-on-multi-node) +- [Profiling](#profiling) - [Adoption and Publications](#adoption-and-publications) * [Publications](#publications) * [Models](#models) @@ -76,6 +77,7 @@ Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherA + [Other Modalities](#other-modalities) - [Administrative Notes](#administrative-notes) * [Citing GPT-NeoX](#citing-gpt-neox) + * [Contributing](#contributing) * [Licensing](#licensing) * [Acknowledgements](#acknowledgements) @@ -500,18 +502,21 @@ where `--eval_tasks` is a list of evaluation tasks followed by spaces, e.g `--ev # Exporting to Hugging Face -GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) GPTNeoXModel format. +GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) format. -To convert a NeoX checkpoint (with pipeline-parallel-size>=1) to Hugging Face-loadable format, run: -```bash -python ./tools/ckpts/convert_module_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location -``` +Though NeoX supports a number of different architectural configurations, including AliBi positional embeddings, not all of these configurations map cleanly onto the supported configurations within Hugging Face Transformers. + +NeoX supports export of compatible models into the following architectures: +- GPTNeoXForCausalLM +- LlamaForCausalLM +- MistralForCausalLM + +Training a model which does not fit into one of these Hugging Face Transformers architectures cleanly will require writing custom modeling code for the exported model. -To convert a sequential model to Hugging Face format, run: +To convert a GPT-NeoX library checkpoint to Hugging Face-loadable format, run: ```bash -python ./tools/ckpts/convert_sequential_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location +python ./tools/ckpts/convert_neox_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location --precision {auto,fp16,bf16,fp32} --architecture {neox,mistral,llama} ``` -(Note: this script should be used for v2.0 checkpoints saved on a v2.0 commit prior to https://github.com/EleutherAI/gpt-neox/pull/866 and which used `pipe-parallel-size=1`. Using `pipe-parallel-size=0` will also save models in this format.) 
Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run: ```bash @@ -520,7 +525,27 @@ python ./tools/ckpts/upload.py ``` and input the requested information, including your HF Hub user token. -Note, however, that this compatibility is not one-to-one, and only certain configurations from GPT-NeoX are supported in the Hugging Face GPTNeoXModel class. Advanced features such as alternative positional embeddings may require new Transformers modeling code and new conversion script tweaks. +### Importing Models Into GPT-NeoX + +NeoX supplies several utilities for converting a pretrained model checkpoint into a format that can be trained within the library. + +The following models or model families can be loaded in GPT-NeoX: +- Llama 1 +- Llama 2 +- CodeLlama +- Mistral-7b-v0.1 + +We provide two utilities for converting from two different checkpoint formats into a format compatible with GPT-NeoX. + +To convert a Llama 1 or Llama 2 checkpoint distributed by Meta AI from its original file format (downloadable [here](https://github.com/facebookresearch/llama) or [here](https://huggingface.co/meta-llama/Llama-2-7b)) into the GPT-NeoX library, run + +``` +python tools/ckpts/convert_raw_llama_weights_to_neox.py --input_dir /path/to/model/parent/dir/7B --model_size 7B --output_dir /path/to/save/ckpt --num_output_shards <num_shards> (--pipeline_parallel if pipeline-parallel-size >= 1) +``` + + +To convert from a Hugging Face model into a NeoX-loadable format, run `tools/ckpts/convert_hf_to_sequential.py`. See documentation within that file for further options. + # Monitoring @@ -538,6 +563,36 @@ We also support using TensorBoard via the `tensorboard-dir` field. + +# Profiling + +We support profiling with Nsight Systems and with PyTorch memory profiling. + +## Nsight Systems Profiling + +To use Nsight Systems profiling, set the config options `profile`, `profile_step_start`, and `profile_step_stop`, then launch training with: + +``` +nsys profile -s none -t nvtx,cuda -o <output_file> --force-overwrite true \ +--capture-range=cudaProfilerApi --capture-range-end=stop python $TRAIN_PATH/deepy.py \ +$TRAIN_PATH/train.py --conf_dir configs +``` + +The generated output file can then be viewed with the Nsight Systems GUI: + +![Alt text](images/nsight_profiling.png) + +## PyTorch Memory Profiling + +To use PyTorch Memory Profiling, set the config options `memory_profiling` and `memory_profiling_path`. + +![Alt text](images/memory_profiling.png) + +View the generated profile with the [memory_viz.py](https://github.com/pytorch/pytorch/blob/main/torch/cuda/_memory_viz.py) script. Run with: + +``` +python _memory_viz.py trace_plot <snapshot_file> -o trace.html +``` + # Adoption and Publications The GPT-NeoX library has been widely adopted by academic and industry researchers and ported to many HPC systems. @@ -637,9 +692,14 @@ To cite the 20 billion parameter model named `GPT-NeoX-20B`, please use } ``` +## Contributing +GPT-NeoX is built by the open-source AI community and relies on our amazing contributors! Please see our +[contributing](CONTRIBUTING.md) guide for more details on our CLA, code formatting, testing, +etc. + ## Licensing -This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2021, EleutherAI. Licensed under the Apache License: +This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2024, EleutherAI. Licensed under the Apache License: Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
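The README changes above document exporting GPT-NeoX checkpoints to the Hugging Face Transformers format via `tools/ckpts/convert_neox_to_hf.py`. As a rough illustrative sketch (not taken from the repository), a converted checkpoint should load with the standard `transformers` API; the path below is the placeholder `output_dir` from the README example, and the tokenizer load assumes tokenizer files were written alongside the weights.

```python
# Illustrative sketch only: load a checkpoint exported by tools/ckpts/convert_neox_to_hf.py.
# "hf_model/save/location" is the placeholder output directory used in the README example.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("hf_model/save/location")
# e.g. ["GPTNeoXForCausalLM"], ["LlamaForCausalLM"], or ["MistralForCausalLM"]
print(model.config.architectures)

# Assumes the export directory also contains tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("hf_model/save/location")
inputs = tokenizer("EleutherAI is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```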
diff --git a/configs/1-3B.yml b/configs/1-3B.yml index 31c316e2e..ea3fdb9bf 100644 --- a/configs/1-3B.yml +++ b/configs/1-3B.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/125M-json.yml b/configs/125M-json.yml index fff863f44..467318f24 100644 --- a/configs/125M-json.yml +++ b/configs/125M-json.yml @@ -16,6 +16,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, "init_method": "small_init", "output_layer_init_method": "wang_init", diff --git a/configs/125M.yml b/configs/125M.yml index a098c6df6..466492f7b 100644 --- a/configs/125M.yml +++ b/configs/125M.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/13B.yml b/configs/13B.yml index 93592896b..99caab585 100644 --- a/configs/13B.yml +++ b/configs/13B.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/175B.yml b/configs/175B.yml index 99efdc6e0..4d011f1b4 100644 --- a/configs/175B.yml +++ b/configs/175B.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/19M.yml b/configs/19M.yml index 148655efc..c14ebe8ea 100644 --- a/configs/19M.yml +++ b/configs/19M.yml @@ -16,6 +16,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", @@ -43,7 +44,7 @@ }, "train_micro_batch_size_per_gpu": 4, #32, - "gas": 1, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, diff --git a/configs/2-7B.yml b/configs/2-7B.yml index ffcd3af95..9e6a47e15 100644 --- a/configs/2-7B.yml +++ b/configs/2-7B.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/20B.yml b/configs/20B.yml index c55d342b9..0a4ce6335 100644 --- a/configs/20B.yml +++ b/configs/20B.yml @@ -31,6 +31,7 @@ "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/350M.yml b/configs/350M.yml index 755f11c63..00a174433 100644 --- a/configs/350M.yml +++ b/configs/350M.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/49M.yml b/configs/49M.yml index e30bc3ebb..099af6a48 100644 --- a/configs/49M.yml +++ b/configs/49M.yml @@ -19,6 +19,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", @@ -48,7 +49,7 @@ # batch / data settings "train_micro_batch_size_per_gpu": 32, - "gas": 1, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, 
diff --git a/configs/6-7B.yml b/configs/6-7B.yml index a3578a014..087b7d763 100644 --- a/configs/6-7B.yml +++ b/configs/6-7B.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/760M.yml b/configs/760M.yml index 0eb6aea31..6d62dc0f3 100644 --- a/configs/760M.yml +++ b/configs/760M.yml @@ -21,6 +21,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", diff --git a/configs/800M.yml b/configs/800M.yml index 28396d58d..4fb9559a8 100644 --- a/configs/800M.yml +++ b/configs/800M.yml @@ -16,6 +16,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # init methods "init_method": "small_init", @@ -43,7 +44,7 @@ }, "train_micro_batch_size_per_gpu": 16, - "gas": 1, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, diff --git a/configs/bf16_125M.yml b/configs/bf16_125M.yml index eac251cae..87e86e7fb 100644 --- a/configs/bf16_125M.yml +++ b/configs/bf16_125M.yml @@ -19,6 +19,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # optimizer settings diff --git a/configs/bnb_125M.yml b/configs/bnb_125M.yml index 619b0084a..523b10c39 100644 --- a/configs/bnb_125M.yml +++ b/configs/bnb_125M.yml @@ -20,6 +20,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # optimizer settings diff --git a/configs/docker/paths.yml b/configs/docker/pythia-paths.yml similarity index 100% rename from configs/docker/paths.yml rename to configs/docker/pythia-paths.yml diff --git a/configs/gen_docs.py b/configs/gen_docs.py index 08431e6c6..1d8c5ba3b 100644 --- a/configs/gen_docs.py +++ b/configs/gen_docs.py @@ -31,7 +31,10 @@ def get_docs(module): field_name, field_def = cur field_type = field_def.type if hasattr(field_type, "__name__"): - field_type = field_type.__name__ + if field_type.__name__ == "Literal" or field_type.__name__ == "Union": + field_type = field_type + else: + field_type = str(field_type.__name__) else: field_type = str(field_type) diff --git a/configs/llama2/13B.yml b/configs/llama2/13B.yml new file mode 100644 index 000000000..973b8bea4 --- /dev/null +++ b/configs/llama2/13B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 40, + "hidden_size": 5120, + "num_attention_heads": 40, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/70B.yml b/configs/llama2/70B.yml new file mode 100644 index 000000000..286a1b568 --- /dev/null +++ b/configs/llama2/70B.yml @@ -0,0 +1,31 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 80, + "hidden_size": 8192, + 
"intermediate_size": 28672, + "num_attention_heads": 64, + "num_kv_heads": 8, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/7B.yml b/configs/llama2/7B.yml new file mode 100644 index 000000000..6a5c97e64 --- /dev/null +++ b/configs/llama2/7B.yml @@ -0,0 +1,26 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/codellama_34B.yml b/configs/llama2/codellama_34B.yml new file mode 100644 index 000000000..88e9afaf6 --- /dev/null +++ b/configs/llama2/codellama_34B.yml @@ -0,0 +1,32 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "num_kv_heads": 8, + # Codellama was uptrained on 16k token sequence lengths + # with rotary_emb_base adjusted to 1_000_000. + "seq_length": 16384, + "max_position_embeddings": 16384, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llama2/codellama_7B.yml b/configs/llama2/codellama_7B.yml new file mode 100644 index 000000000..be123ebee --- /dev/null +++ b/configs/llama2/codellama_7B.yml @@ -0,0 +1,31 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + # Codellama was uptrained on 16k token sequence lengths + # with rotary_emb_base adjusted to 1_000_000. 
+ "seq_length": 16384, + "max_position_embeddings": 16384, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", +} diff --git a/configs/llemma/34B.yml b/configs/llemma/34B.yml new file mode 100644 index 000000000..bd72d7e23 --- /dev/null +++ b/configs/llemma/34B.yml @@ -0,0 +1,101 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 8, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 48, + "hidden_size": 8192, + "num_attention_heads": 64, + "attention_type": "groupedquery", + "num_kv_heads": 8, + # NB: These rotary embedding and sequence length parameters + # May differ from CodeLlama configs. They match what we used for + # Llemma continued pretraining. See https://arxiv.org/abs/2310.10631 + # For detailed discussion + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 1000000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 48]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00005, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # trained on 256 gpus + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 16, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 12000, + "lr_decay_iters": 12000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "min_lr": 1.65e-6, + "warmup": 0.042, # warmup for ~500 iters + "checkpoint_factor": 250, + "eval_interval": 250, + "eval_iters": 25, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + #"vocab-file": # use 'tokenizer.model' from Meta CodeLlama download + + # "load": "" # set to same as "save" to resume from intermediate finetuning step + #"load": MP=8 CodeLlama-34B checkpoint, converted from Meta CodeLlama download. + # When resuming from mid-finetuning run, change "load" to the same as save location. 
+ "finetune": true, # set to false once resuming from intermediate finetuning step +} diff --git a/configs/llemma/7B.yml b/configs/llemma/7B.yml new file mode 100644 index 000000000..fb72c8c18 --- /dev/null +++ b/configs/llemma/7B.yml @@ -0,0 +1,100 @@ +{ + "pipe_parallel_size": 0, + "model_parallel_size": 2, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "num_attention_heads": 32, + # NB: These rotary embedding and sequence length parameters + # May differ from CodeLlama configs. They match what we used for + # Llemma continued pretraining. See https://arxiv.org/abs/2310.10631 + # For detailed discussion + "seq_length": 4096, + "max_position_embeddings": 4096, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + "output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.0001, + "betas": [0.9, 0.95], + "eps": 1.0e-8 + } + }, + + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1260000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1260000000, + "contiguous_gradients": true, + "cpu_offload": false + }, + + # trained on 256 gpus + "train_micro_batch_size_per_gpu": 4, + "gradient_accumulation_steps": 2, + "data_impl": "mmap", + + "checkpoint_activations": true, + "checkpoint_num_layers": 1, + "partition_activations": true, + "synchronize_each_layer": true, + + "gradient_clipping": 1.0, + "weight_decay": 0.1, + "hidden_dropout": 0, + "attention_dropout": 0, + + "precision": "bfloat16", + "fp32_allreduce": true, + "bf16": { + "enabled": true + }, + "data_types": { + "grad_accum_dtype": "fp32" + }, + + "train_iters": 48000, + "lr_decay_iters": 48000, + "distributed_backend": "nccl", + "lr_decay_style": "cosine", + "decay_lr_to": 0.033, + "warmup_iters": 500, + "checkpoint_factor": 500, + "eval_interval": 250, + "eval_iters": 50, + + "log_interval": 1, + "steps_per_print": 1, + "wall_clock_breakdown": true, + + "tokenizer_type": "SPMTokenizer", + "vocab-file": "codellama/tokenizer.model", # use tokenizer.model from Meta CodeLlama download + + "save": "/path/to/save/llema-replication", + #"load": "", # once run is started, to restart from intermediate ckpt use "load" = "save" + "load": "/path/to/converted/codellama_7b_weights_with_mp2", + + "finetune": true, # set to false once resuming from intermediate finetuning step +} diff --git a/configs/mistral/7B.yml b/configs/mistral/7B.yml new file mode 100644 index 000000000..587fe5d36 --- /dev/null +++ b/configs/mistral/7B.yml @@ -0,0 +1,42 @@ +{ + "pipe_parallel_size": 1, + "model_parallel_size": 1, + "make_vocab_size_divisible_by": 1, + + # model settings + "num_layers": 32, + "hidden_size": 4096, + "intermediate_size": 14336, + "num_attention_heads": 32, + "num_kv_heads": 8, + # per Mistral, Mistral-7B-v0.1 was pretrained with 8192 seqlen + # and instruction tuned to 16384 seqlen, all with 4096 sliding window + "seq_length": 8192, + "sliding_window_width": 4096, + "max_position_embeddings": 131072, + "pos_emb": "rotary", + "rotary_pct": 1, + "rotary_emb_base": 10000, + "no_weight_tying": true, + "gpt_j_residual": false, + 
"output_layer_parallelism": "column", + "norm": "rmsnorm", + "rms_norm_epsilon": 1.0e-5, + + # Grouped Query Attention is supported for both default ("global") + # and Flash attention. However, we highly recommend the use of Flash attention + # to get FLOP + runtime speedups when using GQA, + # and sliding window attention is currently only supported by Flash attention. + "attention_config": [[["flash"], 32]], + + "scaled_upper_triang_masked_softmax_fusion": true, + "bias_gelu_fusion": false, + "use_bias_in_norms": false, + "use_bias_in_attn_linear": false, + "mlp_type": "llama", + "activation": "silu", + + "tokenizer_type": "SPMTokenizer", + #"vocab-file": ".../mistral-7B-v0.1/tokenizer.model", # use tokenizer.model from Mistral-7B-v0.1 direct download + +} diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md index 864fdd4ce..edf7f83c5 100644 --- a/configs/neox_arguments.md +++ b/configs/neox_arguments.md @@ -111,7 +111,7 @@ Logging Arguments - **git_hash**: str - Default = a300979 + Default = 11a2e9b current git hash of repository @@ -199,6 +199,54 @@ Logging Arguments +- **memory_profiling**: bool + + Default = False + + Whether to take a memory snapshot of the model. Useful for debugging memory issues. + + + +- **memory_profiling_path**: str + + Default = None + + Path to save memory snapshot to. + + + +- **profile**: bool + + Default = False + + Enable nsys profiling. When using this option, + nsys options should be specified in commandline. + An example nsys commandline is + ``` + nsys profile -s none -t nvtx,cuda -o + --force-overwrite true + --capture-range=cudaProfilerApi + --capture-range-end=stop + ``` + + + +- **profile_step_start**: int + + Default = 10 + + Step to start profiling at. + + + +- **profile_step_stop**: int + + Default = 12 + + Step to stop profiling at. + + + ## NeoXArgsModel Model Arguments @@ -229,12 +277,38 @@ Model Arguments +- **intermediate_size**: int + + Default = None + + Transformer intermediate size. Currently only used for "mlp_type": "llama". + + If not passed, will be set to a reasonable default. + + + - **num_attention_heads**: int Default = None Number of transformer attention heads. + If num_kv_heads is set, will control only number of query heads. + + + +- **num_kv_heads**: int + + Default = None + + Number of transformer key/value attention heads. + + If set to None or the same value as num_attention_heads, will perform multi-head attention (MHA). + If set to < num_attention_heads but > 1, will perform grouped-query attention (GQA) (https://arxiv.org/pdf/2305.13245.pdf) + If set to 1, will perform multi-query attention. + + Must be < num_attention_heads and divide num_attention_heads evenly. + - **seq_length**: int @@ -245,6 +319,14 @@ Model Arguments +- **sliding_window_width**: int + + Default = None + + Width of the attention sliding window. Only supported with Flash Attention 2. + + + - **max_position_embeddings**: int Default = None @@ -261,6 +343,14 @@ Model Arguments +- **layernorm_fusion**: bool + + Default = False + + Use fused layer norm kernel (if `norm` is `layernorm`). 
+ + + - **use_qk_layernorm**: bool Default = False @@ -1022,6 +1112,8 @@ Text Generation arguments Tasks to evaluate on using lm_eval_harness + NOTE: Requires internet connection + ## NeoXArgsTokenizer @@ -1468,14 +1560,6 @@ Training Arguments -- **gas**: int - - Default = None - - gradient_accumulation_steps - - - - **clip_grad**: float Default = 1.0 diff --git a/configs/slurm_125M.yml b/configs/slurm_125M.yml index 798b43629..2ac60e534 100644 --- a/configs/slurm_125M.yml +++ b/configs/slurm_125M.yml @@ -12,6 +12,7 @@ "scaled_upper_triang_masked_softmax_fusion": true, "bias_gelu_fusion": true, "rope_fusion": false, + "layernorm_fusion": false, "optimizer": { "type": "Adam", "params": { diff --git a/deepy.py b/deepy.py index eacbd9dec..e4dc7a3fd 100755 --- a/deepy.py +++ b/deepy.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/docker-compose-dockerhub.yml b/docker-compose-dockerhub.yml index 0207e63f0..4ac5113f7 100644 --- a/docker-compose-dockerhub.yml +++ b/docker-compose-dockerhub.yml @@ -1,7 +1,7 @@ version: '3' services: gpt-neox: - command: nvidia-smi -q --loop=10 + command: nvidia-smi dmon image: leogao2/gpt-neox:main shm_size: 1g ulimits: diff --git a/docker-compose.yml b/docker-compose.yml index a12213e00..b2591aed0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,7 @@ version: '3' services: gpt-neox: - command: nvidia-smi -q --loop=10 + command: nvidia-smi dmon image: gpt-neox build: context: . diff --git a/eval.py b/eval.py index 7b7a74a6f..93093f21d 100644 --- a/eval.py +++ b/eval.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/eval_tasks/__init__.py b/eval_tasks/__init__.py index d17c7458e..6cb4e30ca 100644 --- a/eval_tasks/__init__.py +++ b/eval_tasks/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/eval_tasks/eval_adapter.py b/eval_tasks/eval_adapter.py index 63c4a4a72..abbd5ca8d 100644 --- a/eval_tasks/eval_adapter.py +++ b/eval_tasks/eval_adapter.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/generate.py b/generate.py index 100950f00..743e350d0 100755 --- a/generate.py +++ b/generate.py @@ -1,8 +1,8 @@ #!/usr/bin/env python -# Copyright (c) 2021 EleutherAI +# Copyright (c) 2024 EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
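The `num_kv_heads` argument documented in `configs/neox_arguments.md` above selects between multi-head, grouped-query, and multi-query attention. The following is a generic PyTorch illustration of that head-sharing scheme, not GPT-NeoX's actual attention implementation: each of the `num_kv_heads` key/value heads is shared by a group of `num_attention_heads // num_kv_heads` query heads.

```python
# Generic illustration of grouped-query attention (GQA) head sharing; not GPT-NeoX code.
import torch

batch, seq, head_dim = 2, 16, 64
num_attention_heads, num_kv_heads = 8, 2  # num_kv_heads must divide num_attention_heads evenly

q = torch.randn(batch, num_attention_heads, seq, head_dim)
k = torch.randn(batch, num_kv_heads, seq, head_dim)
v = torch.randn(batch, num_kv_heads, seq, head_dim)

# Repeat each K/V head across its query-head group. With num_kv_heads == num_attention_heads
# this reduces to multi-head attention; with num_kv_heads == 1 it is multi-query attention.
group_size = num_attention_heads // num_kv_heads
k = k.repeat_interleave(group_size, dim=1)
v = v.repeat_interleave(group_size, dim=1)

# Plain scaled dot-product attention (causal masking omitted for brevity).
scores = torch.softmax(q @ k.transpose(-2, -1) / head_dim**0.5, dim=-1)
out = scores @ v
print(out.shape)  # torch.Size([2, 8, 16, 64])
```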
diff --git a/images/memory_profiling.png b/images/memory_profiling.png new file mode 100644 index 000000000..87d62f4c3 Binary files /dev/null and b/images/memory_profiling.png differ diff --git a/images/nsight_profiling.png b/images/nsight_profiling.png new file mode 100644 index 000000000..36ecc6568 Binary files /dev/null and b/images/nsight_profiling.png differ diff --git a/megatron/__init__.py b/megatron/__init__.py index 8cd7804c9..fc254fe3d 100644 --- a/megatron/__init__.py +++ b/megatron/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index bbaa0f7f9..12b81e202 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py index e05c58429..0392841cb 100644 --- a/megatron/data/blendable_dataset.py +++ b/megatron/data/blendable_dataset.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/data/data_utils.py b/megatron/data/data_utils.py index 513dd0e21..bc5754cdb 100644 --- a/megatron/data/data_utils.py +++ b/megatron/data/data_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/data/gpt2_dataset.py b/megatron/data/gpt2_dataset.py index c222fb3da..75e601fda 100644 --- a/megatron/data/gpt2_dataset.py +++ b/megatron/data/gpt2_dataset.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp index 830326c3f..9b062b050 100644 --- a/megatron/data/helpers.cpp +++ b/megatron/data/helpers.cpp @@ -1,6 +1,6 @@ /* coding=utf-8 - Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 81d99b0cd..8165205b9 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # # Copyright (c) Facebook, Inc. and its affiliates. diff --git a/megatron/data/samplers.py b/megatron/data/samplers.py index 5e14b4a78..a9428e41c 100644 --- a/megatron/data/samplers.py +++ b/megatron/data/samplers.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/devutil.py b/megatron/devutil.py new file mode 100644 index 000000000..7563d7dcf --- /dev/null +++ b/megatron/devutil.py @@ -0,0 +1,51 @@ +import torch.cuda + + +class Metric: + """ + Dumb utility to collect and report average wall-time metrics. + """ + + def __init__(self, label): + self.label = label + self.measurements = [] + + def collect(self, measurement): + self.measurements.append(measurement) + + def get_measurements(self): + return self.measurements[:] + + def report(self): + print( + self.label, + torch.quantile(torch.tensor(self.measurements), torch.arange(10) / 10.0), + ) + + +def monitor_method_cuda_wall_times(metric, obj, methodname): + """ + Measure timings for a method on an object or class. + + For instance: + + >>> metric = Metric('!LNORM') + >>> monitor_method_wall_times(metric, LayerNorm, 'forward') + """ + oldmeth = getattr(obj, methodname) + + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + def newmeth(*args, **kw): + start_event.record() + try: + return oldmeth(*args, **kw) + finally: + end_event.record() + torch.cuda.synchronize() + elapsed = start_event.elapsed_time(end_event) + metric.collect(elapsed) + metric.report() + + setattr(obj, methodname, newmeth) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index fff97986a..f6ac063ce 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/compat.h b/megatron/fused_kernels/compat.h index 251337d68..88867dc7e 100644 --- a/megatron/fused_kernels/compat.h +++ b/megatron/fused_kernels/compat.h @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
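The new `megatron/devutil.py` above adds a small `Metric` collector plus `monitor_method_cuda_wall_times`, which patches a method so that every call records its CUDA wall time. A minimal usage sketch, assuming a CUDA device is available and mirroring the `LayerNorm` example from the file's docstring:

```python
# Minimal usage sketch for the Metric / monitor_method_cuda_wall_times helpers (needs a GPU).
import torch
from megatron.devutil import Metric, monitor_method_cuda_wall_times

metric = Metric("!LNORM")
# Patch LayerNorm.forward so each call records its elapsed CUDA time into `metric`
# and prints the running wall-time quantiles via metric.report().
monitor_method_cuda_wall_times(metric, torch.nn.LayerNorm, "forward")

layer = torch.nn.LayerNorm(1024).cuda()
x = torch.randn(8, 1024, device="cuda")
for _ in range(10):
    layer(x)
```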
diff --git a/megatron/fused_kernels/fused_rotary_positional_embedding.cpp b/megatron/fused_kernels/fused_rotary_positional_embedding.cpp index 878f1b2c0..e1a77de2b 100644 --- a/megatron/fused_kernels/fused_rotary_positional_embedding.cpp +++ b/megatron/fused_kernels/fused_rotary_positional_embedding.cpp @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/fused_rotary_positional_embedding.h b/megatron/fused_kernels/fused_rotary_positional_embedding.h index cd67a28d3..aafd5104d 100644 --- a/megatron/fused_kernels/fused_rotary_positional_embedding.h +++ b/megatron/fused_kernels/fused_rotary_positional_embedding.h @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/fused_rotary_positional_embedding_cuda.cu b/megatron/fused_kernels/fused_rotary_positional_embedding_cuda.cu index 0f37dff41..6b54662bc 100644 --- a/megatron/fused_kernels/fused_rotary_positional_embedding_cuda.cu +++ b/megatron/fused_kernels/fused_rotary_positional_embedding_cuda.cu @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/scaled_masked_softmax.cpp b/megatron/fused_kernels/scaled_masked_softmax.cpp index b7c162c78..6a210cc37 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.cpp +++ b/megatron/fused_kernels/scaled_masked_softmax.cpp @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/scaled_masked_softmax.h b/megatron/fused_kernels/scaled_masked_softmax.h index 977e59481..a594a13c1 100644 --- a/megatron/fused_kernels/scaled_masked_softmax.h +++ b/megatron/fused_kernels/scaled_masked_softmax.h @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu index 757850d2b..26c2d1820 100644 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp index 945c48c43..cedd649a2 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index b0757196b..b48afeee4 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 7ced78acd..99a52abd5 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -1,5 +1,5 @@ /* coding=utf-8 - * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/megatron/fused_kernels/setup.py b/megatron/fused_kernels/setup.py index d800b2a00..528a20676 100644 --- a/megatron/fused_kernels/setup.py +++ b/megatron/fused_kernels/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/gradient_noise_scale/gradient_noise_scale.py b/megatron/gradient_noise_scale/gradient_noise_scale.py index 9c518a9cd..71076ffc3 100644 --- a/megatron/gradient_noise_scale/gradient_noise_scale.py +++ b/megatron/gradient_noise_scale/gradient_noise_scale.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/initialize.py b/megatron/initialize.py index dd752659b..72779b094 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index f0fcca27e..9db951aa0 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/logging.py b/megatron/logging.py index ac404da85..6c9b7915e 100644 --- a/megatron/logging.py +++ b/megatron/logging.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI. +# Copyright (c) 2024, EleutherAI. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 9af46de95..619b4c33d 100755 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -1,7 +1,7 @@ # -# Copyright 2021 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. +# Copyright 2024 Biderman et al. This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/activations.py b/megatron/model/activations.py index 5c4ba1d5a..7a29b0716 100644 --- a/megatron/model/activations.py +++ b/megatron/model/activations.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/flash_attention.py b/megatron/model/flash_attention.py deleted file mode 100644 index d446f0a51..000000000 --- a/megatron/model/flash_attention.py +++ /dev/null @@ -1,831 +0,0 @@ -# Based on: https://github.com/HazyResearch/flash-attention/blob/4a6eaa9f27df6fff7ffb2c24e894938a687dd870/flash_attn/flash_attn_interface.py - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from flash_attn import flash_attn_triton -import flash_attn_2_cuda as flash_attn_cuda # For flash_attn version 2.1.1 - - -def flash_attn_unpadded_unpacked_func_triton( - q, k, v, bias=None, causal=False, softmax_scale=None -): - return flash_attn_triton.flash_attn_func(q, k, v, bias, causal, softmax_scale) - - -def _flash_attn_forward_cuda( - q, - k, - v, - out, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, - num_splits=0, - generator=None, -): - """ - num_splits: how much to parallelize over the seqlen_q dimension. num_splits=0 means - it will be set by an internal heuristic. We're exposing num_splits mostly for benchmarking. - Don't change it unless you know what you're doing. 
- """ - softmax_lse, *rest = flash_attn_cuda.fwd( - q, - k, - v, - out, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - return_softmax, - num_splits, - generator, - ) - # if out.isnan().any() or softmax_lse.isnan().any(): - # breakpoint() - S_dmask = rest[0] if return_softmax else None - return out, softmax_lse, S_dmask - - -def _flash_attn_backward_cuda( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - num_splits=0, - generator=None, -): - """ - num_splits: whether to parallelize over the seqlen_k dimension (num_splits > 1) or - not (num_splits = 1). num_splits=0 means it will be set by an internal heuristic. - Any value above 1 will call the same kernel (i.e. num_splits=2 would call the same kernel - as num_splits=3), so effectively the choices are 0, 1, and 2. - This hyperparameter can be tuned for performance, but default value (heuristic) should work fine. - """ - _, _, _, softmax_d = flash_attn_cuda.bwd( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - num_splits, - generator, - ) - # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): - # breakpoint() - return dq, dk, dv, softmax_d - - -class FlashAttnQKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - qkv, - cu_seqlens, - max_seqlen, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - # Save rng_state because the backward pass will regenerate the dropout mask - rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None - if softmax_scale is None: - softmax_scale = qkv.shape[-1] ** (-0.5) - out, softmax_lse, S_dmask = _flash_attn_forward_cuda( - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], - torch.empty_like(qkv[:, 0]), - cu_seqlens, - cu_seqlens, - max_seqlen, - max_seqlen, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax, - ) - ctx.save_for_backward(qkv, out, softmax_lse, cu_seqlens, rng_state) - ctx.dropout_p = dropout_p - ctx.max_seqlen = max_seqlen - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - qkv, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors - if rng_state is not None: - cur_rng_state = torch.cuda.get_rng_state() - torch.cuda.set_rng_state(rng_state) - dqkv = torch.empty_like(qkv) - _flash_attn_backward_cuda( - dout, - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], - out, - softmax_lse, - dqkv[:, 0], - dqkv[:, 1], - dqkv[:, 2], - cu_seqlens, - cu_seqlens, - ctx.max_seqlen, - ctx.max_seqlen, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - ) - if rng_state is not None: - torch.cuda.set_rng_state(cur_rng_state) - return dqkv, None, None, None, None, None, None - - -def flash_attn_unpadded_qkvpacked_func_cuda( - qkv, - cu_seqlens, - max_seqlen, - dropout_p, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - return FlashAttnQKVPackedFunc.apply( - qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs - ) - - -class FlashAttnKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, 
- ): - # Save rng_state because the backward pass will regenerate the dropout mask - rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None - if softmax_scale is None: - softmax_scale = q.shape[-1] ** (-0.5) - out, softmax_lse, S_dmask = _flash_attn_forward_cuda( - q, - kv[:, 0], - kv[:, 1], - torch.empty_like(q), - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax, - ) - ctx.save_for_backward( - q, kv, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state - ) - ctx.dropout_p = dropout_p - ctx.max_seqlen_q = max_seqlen_q - ctx.max_seqlen_k = max_seqlen_k - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - ( - q, - kv, - out, - softmax_lse, - cu_seqlens_q, - cu_seqlens_k, - rng_state, - ) = ctx.saved_tensors - if rng_state is not None: - cur_rng_state = torch.cuda.get_rng_state() - torch.cuda.set_rng_state(rng_state) - dq = torch.empty_like(q) - dkv = torch.empty_like(kv) - _flash_attn_backward_cuda( - dout, - q, - kv[:, 0], - kv[:, 1], - out, - softmax_lse, - dq, - dkv[:, 0], - dkv[:, 1], - cu_seqlens_q, - cu_seqlens_k, - ctx.max_seqlen_q, - ctx.max_seqlen_k, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - ) - if rng_state is not None: - torch.cuda.set_rng_state(cur_rng_state) - return dq, dkv, None, None, None, None, None, None, None, None - - -def flash_attn_unpadded_kvpacked_func_cuda( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - Arguments: - q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. - kv: (total_k, 2, nheads, headdim), where total_k = total number of key tokens in the batch. - cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into q. - cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into kv. - max_seqlen_q: int. Maximum query sequence length in the batch. - max_seqlen_k: int. Maximum key sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). - S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). 
- """ - return FlashAttnKVPackedFunc.apply( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_attn_probs, - ) - - -class FlashAttnFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - # Save rng_state because the backward pass will regenerate the dropout mask - rng_state = torch.cuda.get_rng_state() if dropout_p > 0 else None - if softmax_scale is None: - softmax_scale = q.shape[-1] ** (-0.5) - out, softmax_lse, S_dmask = _flash_attn_forward_cuda( - q, - k, - v, - torch.empty_like(q), - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax, - ) - ctx.save_for_backward( - q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state - ) - ctx.dropout_p = dropout_p - ctx.max_seqlen_q = max_seqlen_q - ctx.max_seqlen_k = max_seqlen_k - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - ( - q, - k, - v, - out, - softmax_lse, - cu_seqlens_q, - cu_seqlens_k, - rng_state, - ) = ctx.saved_tensors - if rng_state is not None: - cur_rng_state = torch.cuda.get_rng_state() - torch.cuda.set_rng_state(rng_state) - dq, dk, dv = torch.empty_like(q), torch.empty_like(k), torch.empty_like(v) - _flash_attn_backward_cuda( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - ctx.max_seqlen_q, - ctx.max_seqlen_k, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - ) - if rng_state is not None: - torch.cuda.set_rng_state(cur_rng_state) - return dq, dk, dv, None, None, None, None, None, None, None, None - - -def flash_attn_unpadded_func_cuda( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - Arguments: - q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. - k: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. - v: (total_k, nheads, headdim), where total_k = total number of key tokens in the batch. - cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into q. - cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into kv. - max_seqlen_q: int. Maximum query sequence length in the batch. - max_seqlen_k: int. Maximum key sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). 
- S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). - """ - return FlashAttnFunc.apply( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_attn_probs, - ) - - -# For flash-attention 2 integration -def _flash_attn_varlen_forward( - q, - k, - v, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, -): - maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x - q, k, v = [maybe_contiguous(x) for x in (q, k, v)] - ( - out, - q, - k, - v, - out_padded, - softmax_lse, - S_dmask, - rng_state, - ) = flash_attn_cuda.varlen_fwd( - q, - k, - v, - None, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - return_softmax, - None, - ) - # if out.isnan().any() or softmax_lse.isnan().any(): - # breakpoint() - return out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state - - -def _flash_attn_varlen_backward( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - rng_state=None, -): - maybe_contiguous = lambda x: x.contiguous() if x.stride(-1) != 1 else x - # dq, dk, dv are allocated by us so they should already be contiguous - dout, q, k, v, out = [maybe_contiguous(x) for x in (dout, q, k, v, out)] - dq, dk, dv, softmax_d, = flash_attn_cuda.varlen_bwd( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dk, - dv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - False, - causal, - None, - rng_state, - ) - # if dk.isnan().any() or dk.isnan().any() or dv.isnan().any() or softmax_d.isnan().any(): - # breakpoint() - return dq, dk, dv, softmax_d - - -class FlashAttnVarlenQKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - qkv, - cu_seqlens, - max_seqlen, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - if softmax_scale is None: - softmax_scale = qkv.shape[-1] ** (-0.5) - ( - out, - q, - k, - v, - out_padded, - softmax_lse, - S_dmask, - rng_state, - ) = _flash_attn_varlen_forward( - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], - cu_seqlens, - cu_seqlens, - max_seqlen, - max_seqlen, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax and dropout_p > 0, - ) - ctx.save_for_backward(q, k, v, out_padded, softmax_lse, cu_seqlens, rng_state) - ctx.dropout_p = dropout_p - ctx.max_seqlen = max_seqlen - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - q, k, v, out, softmax_lse, cu_seqlens, rng_state = ctx.saved_tensors - qkv_shape = q.shape[:-2] + (3, *q.shape[-2:]) - dqkv = torch.empty(qkv_shape, dtype=q.dtype, device=q.device) - _flash_attn_varlen_backward( - dout, - q, - k, - v, - out, - softmax_lse, - dqkv[:, 0], - dqkv[:, 1], - dqkv[:, 2], - cu_seqlens, - cu_seqlens, - ctx.max_seqlen, - ctx.max_seqlen, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - rng_state=rng_state, - ) - dqkv = dqkv[..., : dout.shape[-1]] # We could have padded the head dimension - return dqkv, None, None, None, None, None, None - - -def 
flash_attn_varlen_qkvpacked_func( - qkv, - cu_seqlens, - max_seqlen, - dropout_p=0.0, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - If Q, K, V are already stacked into 1 tensor, this function will be faster than - calling flash_attn_varlen_func on Q, K, V since the backward pass avoids explicit concatenation - of the gradients of Q, K, V. - For multi-query and grouped-query attention (MQA/GQA), please see - flash_attn_varlen_kvpacked_func and flash_attn_varlen_func. - - Arguments: - qkv: (total, 3, nheads, headdim), where total = total number of tokens in the batch. - cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into qkv. - max_seqlen: int. Maximum sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). - S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). 
- """ - return FlashAttnVarlenQKVPackedFunc.apply( - qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale, causal, return_attn_probs - ) - - -class FlashAttnVarlenKVPackedFunc(torch.autograd.Function): - @staticmethod - def forward( - ctx, - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_softmax, - ): - if softmax_scale is None: - softmax_scale = q.shape[-1] ** (-0.5) - ( - out, - q, - k, - v, - out_padded, - softmax_lse, - S_dmask, - rng_state, - ) = _flash_attn_varlen_forward( - q, - kv[:, 0], - kv[:, 1], - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal=causal, - return_softmax=return_softmax and dropout_p > 0, - ) - ctx.save_for_backward( - q, k, v, out_padded, softmax_lse, cu_seqlens_q, cu_seqlens_k, rng_state - ) - ctx.dropout_p = dropout_p - ctx.max_seqlen_q = max_seqlen_q - ctx.max_seqlen_k = max_seqlen_k - ctx.softmax_scale = softmax_scale - ctx.causal = causal - return out if not return_softmax else (out, softmax_lse, S_dmask) - - @staticmethod - def backward(ctx, dout, *args): - ( - q, - k, - v, - out, - softmax_lse, - cu_seqlens_q, - cu_seqlens_k, - rng_state, - ) = ctx.saved_tensors - dq = torch.empty_like(q) - kv_shape = k.shape[:-2] + (2, *k.shape[-2:]) - dkv = torch.empty(kv_shape, dtype=k.dtype, device=k.device) - _flash_attn_varlen_backward( - dout, - q, - k, - v, - out, - softmax_lse, - dq, - dkv[:, 0], - dkv[:, 1], - cu_seqlens_q, - cu_seqlens_k, - ctx.max_seqlen_q, - ctx.max_seqlen_k, - ctx.dropout_p, - ctx.softmax_scale, - ctx.causal, - rng_state=rng_state, - ) - dq = dq[..., : dout.shape[-1]] # We could have padded the head dimension - dkv = dkv[..., : dout.shape[-1]] - return dq, dkv, None, None, None, None, None, None, None, None - - -def flash_attn_varlen_kvpacked_func( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p=0.0, - softmax_scale=None, - causal=False, - return_attn_probs=False, -): - """dropout_p should be set to 0.0 during evaluation - If K, V are already stacked into 1 tensor, this function will be faster than - calling flash_attn_func on Q, K, V since the backward pass avoids explicit concatenation - of the gradients of K, V. - Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads - than Q. Note that the number of heads in Q must be divisible by the number of heads in KV. - For example, if Q has 6 heads and K, V have 2 heads, head 0, 1, 2 of Q will attention to head - 0 of K, V, and head 3, 4, 5 of Q will attention to head 1 of K, V. - - If causal=True, the causal mask is aligned to the bottom right corner of the attention matrix. - For example, if seqlen_q = 2 and seqlen_k = 5, the causal mask (1 = keep, 0 = masked out) is: - 1 1 1 1 0 - 1 1 1 1 1 - If seqlen_q = 5 and seqlen_k = 2, the causal mask is: - 0 0 - 0 0 - 0 0 - 1 0 - 1 1 - If the row of the mask is all zero, the output will be zero. - - Arguments: - q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch. - kv: (total_k, 2, nheads_k, headdim), where total_k = total number of key tokens in the batch. - cu_seqlens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into q. - cu_seqlens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths - of the sequences in the batch, used to index into kv. - max_seqlen_q: int. Maximum query sequence length in the batch. 
- max_seqlen_k: int. Maximum key sequence length in the batch. - dropout_p: float. Dropout probability. - softmax_scale: float. The scaling of QK^T before applying softmax. - Default to 1 / sqrt(headdim). - causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling). - return_attn_probs: bool. Whether to return the attention probabilities. This option is for - testing only. The returned probabilities are not guaranteed to be correct - (they might not have the right scaling). - Return: - out: (total, nheads, headdim). - softmax_lse [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen). The - logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax - normalization factor). - S_dmask [optional, if return_attn_probs=True]: (batch_size, nheads, seqlen, seqlen). - The output of softmax (possibly with different scaling). It also encodes the dropout - pattern (negative means that location was dropped, nonnegative means it was kept). - """ - return FlashAttnVarlenKVPackedFunc.apply( - q, - kv, - cu_seqlens_q, - cu_seqlens_k, - max_seqlen_q, - max_seqlen_k, - dropout_p, - softmax_scale, - causal, - return_attn_probs, - ) diff --git a/megatron/model/fused_bias_dropout.py b/megatron/model/fused_bias_dropout.py index afe1f0719..8618a2a7e 100644 --- a/megatron/model/fused_bias_dropout.py +++ b/megatron/model/fused_bias_dropout.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI contributors +# Copyright (c) 2024, EleutherAI contributors # This file is based on code by the authors denoted below and has been modified from its original version. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py new file mode 100644 index 000000000..d33ded506 --- /dev/null +++ b/megatron/model/fused_layer_norm.py @@ -0,0 +1,150 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""This code is copied from NVIDIA apex: + https://github.com/NVIDIA/apex + with some changes. """ + +import numbers +import torch +from torch.nn.parameter import Parameter +from torch.nn import init +import importlib +from torch.nn import functional as F +import inspect + +from megatron.utils import make_viewless_tensor + +try: + from apex.contrib.layer_norm.layer_norm import FastLayerNormFN + + HAVE_PERSIST_LAYER_NORM = True +except: + HAVE_PERSIST_LAYER_NORM = False + +from apex.normalization.fused_layer_norm import FusedLayerNormAffineFunction + + +global fused_layer_norm_cuda +fused_layer_norm_cuda = None + + +class MixedFusedLayerNorm(torch.nn.Module): + def __init__( + self, + normalized_shape, + eps=1e-5, + no_persist_layer_norm=True, + sequence_parallel=False, + apply_layernorm_1p=False, + mem_efficient_ln=True, + ): + super(MixedFusedLayerNorm, self).__init__() + + self.apply_layernorm_1p = apply_layernorm_1p + self.mem_efficient_ln = mem_efficient_ln + + global fused_layer_norm_cuda + fused_layer_norm_cuda = importlib.import_module("fused_layer_norm_cuda") + + # List of hiddens sizes supported in the persistent layer norm kernel + # If the hidden size is not supported, fall back to the non-persistent + # kernel. 
+ persist_ln_hidden_sizes = [ + 1024, + 1536, + 2048, + 2304, + 3072, + 3840, + 4096, + 5120, + 6144, + 8192, + 10240, + 12288, + 12800, + 15360, + 16384, + 18432, + 20480, + 24576, + 25600, + 30720, + 32768, + 40960, + 49152, + 65536, + ] + if ( + normalized_shape not in persist_ln_hidden_sizes + or not HAVE_PERSIST_LAYER_NORM + ): + no_persist_layer_norm = True + + if isinstance(normalized_shape, numbers.Integral): + normalized_shape = (normalized_shape,) + self.normalized_shape = torch.Size(normalized_shape) + self.eps = eps + self.weight = Parameter(torch.Tensor(*normalized_shape)) + self.bias = Parameter(torch.Tensor(*normalized_shape)) + self.reset_parameters() + self.no_persist_layer_norm = no_persist_layer_norm + self.sequence_parallel = sequence_parallel + + # set sequence parallelism flag on weight and bias parameters + setattr(self.weight, "sequence_parallel", self.sequence_parallel) + setattr(self.bias, "sequence_parallel", self.sequence_parallel) + + def reset_parameters(self): + + if self.apply_layernorm_1p: + init.zeros_(self.weight) + init.zeros_(self.bias) + else: + init.ones_(self.weight) + init.zeros_(self.bias) + + def forward(self, input): + + weight = self.weight + 1 if self.apply_layernorm_1p else self.weight + # CPU path is here for unittest sake. + if not input.is_cuda: + print( + "WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedLayerNorm unit tests." + ) + return F.layer_norm( + input, self.normalized_shape, weight, self.bias, self.eps + ) + + if self.no_persist_layer_norm: + # Apex does not have versions yet (https://github.com/NVIDIA/apex/pull/1648), so we need to inspect + # the function manually on whether the extra arg introduced in https://github.com/NVIDIA/apex/pull/1715 exists yet + if ( + "memory_efficient" + in inspect.getfullargspec(FusedLayerNormAffineFunction.forward).args + ): + return FusedLayerNormAffineFunction.apply( + input, + weight, + self.bias, + self.normalized_shape, + self.eps, + self.mem_efficient_ln, + ) + else: + return FusedLayerNormAffineFunction.apply( + input, weight, self.bias, self.normalized_shape, self.eps + ) + else: + output = FastLayerNormFN.apply(input, weight, self.bias, self.eps) + + # Apex's fast layer norm function outputs a 'view' tensor (i.e., has + # a populated '_base' field). This will result in schedule.py's + # deallocate_output_tensor() throwing an error, so a viewless tensor is + # created to prevent this. + output = make_viewless_tensor( + inp=output, requires_grad=input.requires_grad, keep_graph=True + ) + + return output diff --git a/megatron/model/fused_rope.py b/megatron/model/fused_rope.py index 9f63f1537..94e96253d 100644 --- a/megatron/model/fused_rope.py +++ b/megatron/model/fused_rope.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 78f2992ad..bce2e1992 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/gmlp.py b/megatron/model/gmlp.py index e4e29dad5..c3462c651 100644 --- a/megatron/model/gmlp.py +++ b/megatron/model/gmlp.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/megatron/model/gpt2_model.py b/megatron/model/gpt2_model.py index ce9c39da9..e083351cc 100644 --- a/megatron/model/gpt2_model.py +++ b/megatron/model/gpt2_model.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021 EleutherAI +# Copyright (c) 2024 EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/init_functions.py b/megatron/model/init_functions.py index 11bcdc310..ad8ebc05a 100644 --- a/megatron/model/init_functions.py +++ b/megatron/model/init_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/norms.py b/megatron/model/norms.py index ddb45c3f3..4d01dde5a 100644 --- a/megatron/model/norms.py +++ b/megatron/model/norms.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ import torch from torch.nn import LayerNorm as LayerNorm +from .fused_layer_norm import MixedFusedLayerNorm def get_norm(neox_args): @@ -22,7 +23,7 @@ def get_norm(neox_args): eps = neox_args.rms_norm_epsilon elif neox_args.norm == "layernorm": eps = neox_args.layernorm_epsilon - norm = LayerNorm + norm = MixedFusedLayerNorm if neox_args.layernorm_fusion else LayerNorm elif neox_args.norm == "scalenorm": eps = neox_args.scalenorm_epsilon norm = ScaleNorm diff --git a/megatron/model/positional_embeddings.py b/megatron/model/positional_embeddings.py index 29c8a7e65..16cd22ad8 100644 --- a/megatron/model/positional_embeddings.py +++ b/megatron/model/positional_embeddings.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
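The new `fused_layer_norm.py` above gates two apex code paths: the persistent `FastLayerNormFN` kernel (only enabled for a fixed list of hidden sizes) and the `memory_efficient` argument of `FusedLayerNormAffineFunction`, which apex exposes without any version number to check against. The snippet below is a minimal, standalone sketch of the feature-detection trick it relies on via `inspect.getfullargspec`; the dummy class is hypothetical and stands in for apex's autograd function, which is not imported here.

```python
import inspect

class _DummyFusedLNFn:
    # Hypothetical stand-in for apex's FusedLayerNormAffineFunction; the real
    # class is a torch.autograd.Function whose forward() may or may not accept
    # `memory_efficient` depending on the installed apex commit.
    @staticmethod
    def forward(ctx, input, weight, bias, normalized_shape, eps, memory_efficient):
        raise NotImplementedError

def accepts_memory_efficient(fused_fn_cls) -> bool:
    # apex carries no package version, so MixedFusedLayerNorm inspects the
    # forward() signature instead and only passes its mem_efficient_ln flag
    # when the extra argument actually exists.
    return "memory_efficient" in inspect.getfullargspec(fused_fn_cls.forward).args

print(accepts_memory_efficient(_DummyFusedLNFn))  # True
```

The `layernorm_fusion` flag added in `norms.py` is what routes `norm == "layernorm"` to this `MixedFusedLayerNorm` class instead of `torch.nn.LayerNorm`.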
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 16d9213aa..34ee5ea6b 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -54,8 +54,10 @@ """ We use the following notation throughout this file: h: hidden size n: number of attention heads + kv: number of key or value heads p: number of model parallel partitions np: n/p + kvp: kv/p hp: h/p hn: h/n b: batch size @@ -177,8 +179,13 @@ def __init__( self.multiple_of = multiple_of - ff_dim = int(2 * neox_args.hidden_size * 4 / 3) - ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + # Allow custom intermediate size, e.g. for Mistral + if neox_args.intermediate_size is not None: + ff_dim = neox_args.intermediate_size + else: + ff_dim = int(2 * neox_args.hidden_size * 4 / 3) + ff_dim = self.multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + self.w1 = mpu.ColumnParallelLinear( neox_args=neox_args, input_size=neox_args.hidden_size, @@ -307,6 +314,7 @@ def __init__( neox_args.num_attention_heads, world_size ) self.pos_emb = neox_args.pos_emb + self.use_qk_layernorm = neox_args.use_qk_layernorm if self.use_qk_layernorm: norm, eps = get_norm(neox_args) @@ -318,15 +326,46 @@ def __init__( eps=eps, ) - # Strided linear layer. - self.query_key_value = mpu.ColumnParallelLinear( - neox_args=neox_args, - input_size=neox_args.hidden_size, - output_size=3 * neox_args.hidden_size, - gather_output=False, - init_method=init_method, - bias=neox_args.use_bias_in_attn_linear, - ) + self.sliding_window_width = neox_args.sliding_window_width + + if ( + not neox_args.num_kv_heads + or neox_args.num_kv_heads == neox_args.num_attention_heads + ): + self.gqa = False + else: + self.gqa = True + if self.gqa: + self.num_kv_heads_per_partition = mpu.divide( + neox_args.num_kv_heads, world_size + ) # we do not yet clone KV heads in MQA across TP ranks... + self.kv_hidden_size = ( + neox_args.num_kv_heads * self.hidden_size_per_attention_head + ) # how large the total hidden dim for each of K and V is + else: + self.num_kv_heads_per_partition = self.num_attention_heads_per_partition + self.kv_hidden_size = neox_args.hidden_size + + if not self.gqa: + # Strided linear layer. 
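As a side note on the MLP sizing change earlier in this `transformer.py` hunk: when `intermediate_size` is not set, the LLaMA-style feed-forward width is still derived from the hidden size and rounded up to `multiple_of`. A small sketch of that arithmetic follows; the `multiple_of=256` default is an assumption for illustration, since the actual value comes from the layer's constructor.

```python
def default_llama_ff_dim(hidden_size: int, multiple_of: int = 256) -> int:
    # Fallback used when neox_args.intermediate_size is None:
    # 2/3 * (4 * hidden_size), rounded up to a multiple of `multiple_of`.
    ff_dim = int(2 * hidden_size * 4 / 3)
    return multiple_of * ((ff_dim + multiple_of - 1) // multiple_of)

print(default_llama_ff_dim(4096))  # 11008, the LLaMA-7B value
# Models such as Mistral-7B instead set intermediate_size explicitly (14336),
# which now bypasses this formula entirely.
```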
+ self.query_key_value = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=3 * neox_args.hidden_size, + gather_output=False, + init_method=init_method, + bias=neox_args.use_bias_in_attn_linear, + ) + else: + # QKV proj is smaller if we are using GQA / MQA + self.query_key_value = mpu.ColumnParallelLinear( + neox_args=neox_args, + input_size=neox_args.hidden_size, + output_size=neox_args.hidden_size + 2 * self.kv_hidden_size, + gather_output=False, + init_method=init_method, + bias=neox_args.use_bias_in_attn_linear, + ) coeff = None self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -373,6 +412,10 @@ def __init__( self.attention_type = neox_args.attention_config[layer_number] self.use_flash_attention = self.attention_type == "flash" self.sparse = self.attention_type not in ("global", "flash") + + if self.gqa: + assert not self.sparse + if self.sparse: self.sparse_attn = configure_sparse_attention( neox_args, @@ -382,18 +425,21 @@ def __init__( ) else: if self.use_flash_attention: - from megatron.model.flash_attention import ( - # flash_attn_unpadded_qkvpacked_func_cuda, - # flash_attn_unpadded_kvpacked_func_cuda, - # Change of function names going from flash attention 1 -> flash attention 2 - flash_attn_varlen_qkvpacked_func, - flash_attn_varlen_kvpacked_func, - flash_attn_unpadded_unpacked_func_triton, + # we now use Flash Attention 2's provided interface. + # TODO: we no longer need to use flash_triton_fn since flash cuda supports alibi. + # consider adding OpenAI's more recent Flash-2 Triton kernel in future + # from https://github.com/openai/triton/blob/main/python/tutorials/06-fused-attention.py + from flash_attn.flash_attn_interface import ( + flash_attn_func, + flash_attn_varlen_func, + ) + from flash_attn.flash_attn_triton import ( + flash_attn_func as flash_attn_unpadded_unpacked_func_triton, ) self.flash_triton_fn = flash_attn_unpadded_unpacked_func_triton - self.flash_qkv_fn = flash_attn_varlen_qkvpacked_func - self.flash_kv_fn = flash_attn_varlen_kvpacked_func + self.flash_qkv_fn = flash_attn_func + self.flash_varlen_qkv_fn = flash_attn_varlen_func else: self.scale_mask_softmax = FusedScaleMaskSoftmax( input_in_fp16=self.fp16, @@ -436,13 +482,11 @@ def attention( query_layer.size(0), key_layer.size(0), ) - # [sq, b, np, hn] -> [sq, b * np, hn] query_layer = query_layer.view( output_size[2], output_size[0] * output_size[1], -1 ) key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - # preallocating result tensor: [b * np, sq, sk] matmul_result = torch.empty( output_size[0] * output_size[1], @@ -463,7 +507,6 @@ def attention( # change view to [b, np, sq, sk] attention_scores = matmul_result.view(*output_size) - # ================================================== # Update attention mask for inference. 
[b, np, sq, sk] # ================================================== @@ -538,10 +581,10 @@ def flash_attention(self, query_layer, key_layer, value_layer): # [sk, b, np, hn] -> [b, sk, np, hn] -> [b * sk, 1, np, hn] key_layer = key_layer.transpose(0, 1).reshape( - output_size[0] * output_size[3], 1, output_size[1], -1 + output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 ) value_layer = value_layer.transpose(0, 1).reshape( - output_size[0] * output_size[3], 1, output_size[1], -1 + output_size[0], output_size[3], self.num_kv_heads_per_partition, -1 ) batch_size = output_size[0] @@ -564,51 +607,55 @@ def flash_attention(self, query_layer, key_layer, value_layer): device=key_layer.device, ) - if not self.training: - - # [sq, b, np, hn] -> [b * sq, np, hn] - query_layer = query_layer.transpose(0, 1).reshape( - output_size[0] * output_size[2], output_size[1], -1 - ) - - # Combined k/v into [b * sk, 2, np, hn]. - kv = torch.cat([key_layer, value_layer], dim=1) + # [sq, b, np, hn] -> [b, sq, np, hn] + query_layer = query_layer.transpose(0, 1).reshape( + output_size[0], output_size[2], output_size[1], -1 + ) - output = self.flash_kv_fn( - query_layer, - kv, + # only pass in window_size kwarg to flash-attn + # if we use Sliding Window Attention. + # Flash attn defaults to (-1,-1), or + # does not have this kwarg prior to v2.3.0 + extra_kwargs = ( + {"window_size": (self.sliding_window_width, -1)} + if self.sliding_window_width is not None + else {} + ) + if not self.training: + q_shape = query_layer.shape + k_shape = key_layer.shape + v_shape = value_layer.shape + output = self.flash_varlen_qkv_fn( + query_layer.reshape( + (q_shape[0] * q_shape[1], q_shape[2], q_shape[3]) + ), + key_layer.reshape( + (k_shape[0] * k_shape[1], k_shape[2], k_shape[3]) + ), + value_layer.reshape( + (v_shape[0] * v_shape[1], v_shape[2], v_shape[3]) + ), cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, - self.dropout_p if self.training else 0.0, softmax_scale=None, causal=True, + **extra_kwargs, ) - + output = output.reshape(q_shape) else: - - # [sq, b, np, hn] -> [b * sq, 1, np, hn] - query_layer = query_layer.transpose(0, 1).reshape( - output_size[0] * output_size[2], 1, output_size[1], -1 - ) - - # Combined q/k/v into [b * s, 3, np, hn]. - qkv = torch.cat([query_layer, key_layer, value_layer], dim=1) - output = self.flash_qkv_fn( - qkv, - cu_seqlens_q, - max_seqlen_q, + query_layer, + key_layer, + value_layer, self.dropout_p if self.training else 0.0, softmax_scale=None, causal=True, + **extra_kwargs, ) - # [b * sq, np, hn] -> [b, sq, np, hn] - matmul_result = output.view( - output_size[0], output_size[2], output.shape[1], output.shape[2] - ) + matmul_result = output # [b, sq, np, hn] -> [b, np, sq, hn] matmul_result = matmul_result.transpose(1, 2) @@ -650,6 +697,103 @@ def sparse_attention(self, query_layer, key_layer, value_layer, attention_mask): query_layer, key_layer, value_layer, attn_mask=attn_mask, rpe=rpe ) + def gqa_project(self, hidden_states, attention_mask, layer_past=None): + # QKV projection and separation into separate Q/K/V layers for GQA, + # where KV projections may be smaller than Q projection. + # the logic for this is explained in comments of this function + # detailing the intermediate sizes of tensors at each reshape. + + # pass through projection: [sq, b, h] --> [sq, b, ((np + 2 * kvp) * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # First: reshape so we have seqlen, batch, and num. 
query heads each as separate dims + # Final dim is not exactly head dim: the first (head dim) dims are query heads, + # The last (head dim * ratio of kv to q heads) each are the "k/v heads" + # (right now we treat like we have same num. heads, but smaller head dim) + + # [sq, b, ((np + 2 * kvp) * hn)] --> [sq, b, np, (hn * (1 + 2 * (kvp / np)))] + new_qkv_shape = ( + mixed_x_layer.shape[0], + mixed_x_layer.shape[1], + self.num_attention_heads_per_partition, + int( + self.hidden_size_per_attention_head + * ( + 1 + + 2 + * ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + ) + ), + ) + mixed_x_layer = mixed_x_layer.reshape(*new_qkv_shape) + + # Next: split our fake head dim. (last dim) so that the first (head dim) dimensions go to Q, + # the last smaller 2 * (head dim * kv to q head ratio) each divided between K and V separately + split_sizes = ( + self.hidden_size_per_attention_head, + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + int( + ( + self.num_kv_heads_per_partition + / self.num_attention_heads_per_partition + ) + * self.hidden_size_per_attention_head + ), + ) + + # [sq, b, np, (hn * (1 + 2 * (kvp / np)))] --> 1 x [sq, b, np, hn] , 2 x [sq, b, np, (hn * (kvp / np))] + (query_layer, key_layer, value_layer) = [ + x.contiguous() + for x in torch.split( + mixed_x_layer, + split_sizes, + dim=mixed_x_layer.dim() - 1, + ) + ] + + # reshape K/V to proper output shape (last dim = correct full "real" head size again) + # 2 x [sq, b, np, (hn * (kvp / np))] --> 2 x [sq, b, kvp, hn] + new_kv_shape = ( + key_layer.size(0), + key_layer.size(1), + self.num_kv_heads_per_partition, + self.hidden_size_per_attention_head, + ) + + key_layer = key_layer.view(*new_kv_shape) + + value_layer = value_layer.view(*new_kv_shape) + + # if not using Flash attention, we repeat K/V heads to match Q head counts + if not self.use_flash_attention: + key_layer = torch.repeat_interleave( + key_layer, + repeats=int( + self.num_attention_heads_per_partition + // self.num_kv_heads_per_partition + ), + dim=2, + ) + value_layer = torch.repeat_interleave( + value_layer, + repeats=int( + self.num_attention_heads_per_partition + // self.num_kv_heads_per_partition + ), + dim=2, + ) + + return query_layer, key_layer, value_layer + def forward(self, hidden_states, attention_mask, layer_past=None): # hidden_states: [sq, b, h] @@ -658,20 +802,31 @@ def forward(self, hidden_states, attention_mask, layer_past=None): # Query, Key, and Value # ===================== - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer, _ = self.query_key_value(hidden_states) + if not self.gqa: + # QKV projection for MHA. 
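To make the reshape-and-split bookkeeping in `gqa_project()` above concrete, here is a minimal re-derivation of the tensor widths with hypothetical numbers (hidden size 4096, 32 query heads, 8 KV heads, no tensor parallelism); it mirrors the comments in the code rather than replacing them.

```python
def gqa_projection_widths(hidden_size, num_q_heads, num_kv_heads, tp_size=1):
    # The fused QKV output is viewed as [sq, b, np, hn * (1 + 2 * kvp / np)]
    # and then split on the last dim into one Q slice of width hn and two
    # K/V slices of width hn * (kvp / np) each.
    hn = hidden_size // num_q_heads        # head dim
    np_ = num_q_heads // tp_size           # query heads per partition
    kvp = num_kv_heads // tp_size          # KV heads per partition
    fused_last_dim = int(hn * (1 + 2 * kvp / np_))
    split_sizes = (hn, int(hn * kvp / np_), int(hn * kvp / np_))
    return fused_last_dim, split_sizes

print(gqa_projection_widths(4096, 32, 8))   # (192, (128, 32, 32))
print(gqa_projection_widths(4096, 32, 32))  # (384, (128, 128, 128)), i.e. plain MHA
```

The K/V slices are then viewed back to `[sq, b, kvp, hn]`, and only the non-flash path repeats them with `torch.repeat_interleave` to match the query head count.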
- # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( - mixed_x_layer, 3 - ) + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = mpu.split_tensor_along_last_dim( + mixed_x_layer, 3 + ) + else: + # Grouped Query Attention (GQA) - specific logic for performing QKV proj + # and separating out Q, K, and V outputs. + + # output shapes: 1 x [sq, b, np, hn], 2 x [sq, b, kvp, hn] if using flash + query_layer, key_layer, value_layer = self.gqa_project( + hidden_states, attention_mask, layer_past=layer_past + ) # QK Normalization https://arxiv.org/abs/2302.05442 if self.use_qk_layernorm: diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 450695934..95a6c48c9 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021 EleutherAI +# Copyright (c) 2024 EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/model/word_embeddings.py b/megatron/model/word_embeddings.py index 488baf042..f7372bc55 100644 --- a/megatron/model/word_embeddings.py +++ b/megatron/model/word_embeddings.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/__init__.py b/megatron/mpu/__init__.py index b20a10850..2365507d9 100644 --- a/megatron/mpu/__init__.py +++ b/megatron/mpu/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/cross_entropy.py b/megatron/mpu/cross_entropy.py index d28e0cc6b..d9f04703d 100644 --- a/megatron/mpu/cross_entropy.py +++ b/megatron/mpu/cross_entropy.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
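The rewritten `flash_attention()` path above now calls flash-attn 2's public `flash_attn_func` during training and `flash_attn_varlen_func` at inference time, optionally passing `window_size` for sliding window attention. Below is a CPU-safe sketch of the two helper pieces that feed those calls: building `cu_seqlens` boundaries for equal-length sequences and deciding whether to pass the window kwarg. The flash-attn call itself is left commented out because it needs a CUDA build of flash-attn (>= 2.3.0 for `window_size`).

```python
import torch

def equal_length_cu_seqlens(batch_size: int, seqlen: int) -> torch.Tensor:
    # Boundary offsets into the packed [batch * seqlen, heads, head_dim]
    # layout used by the inference branch of flash_attention().
    return torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32)

def window_kwargs(sliding_window_width):
    # Only pass window_size when a sliding window is configured; leaving it
    # out keeps flash-attn's full-attention default of (-1, -1).
    if sliding_window_width is not None:
        return {"window_size": (sliding_window_width, -1)}
    return {}

cu_seqlens = equal_length_cu_seqlens(batch_size=2, seqlen=8)
print(cu_seqlens)            # tensor([ 0,  8, 16], dtype=torch.int32)
print(window_kwargs(4096))   # {'window_size': (4096, -1)}

# from flash_attn.flash_attn_interface import flash_attn_varlen_func
# out = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, 8, 8,
#                              softmax_scale=None, causal=True,
#                              **window_kwargs(4096))
```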
diff --git a/megatron/mpu/data.py b/megatron/mpu/data.py index 7b937053b..87e2a9615 100644 --- a/megatron/mpu/data.py +++ b/megatron/mpu/data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/initialize.py b/megatron/mpu/initialize.py index d604627c2..19d231524 100644 --- a/megatron/mpu/initialize.py +++ b/megatron/mpu/initialize.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/layers.py b/megatron/mpu/layers.py index 8bf1dd224..0d14806ac 100644 --- a/megatron/mpu/layers.py +++ b/megatron/mpu/layers.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/mappings.py b/megatron/mpu/mappings.py index c14693a69..e09462eca 100644 --- a/megatron/mpu/mappings.py +++ b/megatron/mpu/mappings.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/random.py b/megatron/mpu/random.py index f93a912a8..77fe80b3e 100644 --- a/megatron/mpu/random.py +++ b/megatron/mpu/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/mpu/utils.py b/megatron/mpu/utils.py index cb12f2569..13941dc29 100644 --- a/megatron/mpu/utils.py +++ b/megatron/mpu/utils.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/megatron/neox_arguments/arguments.py b/megatron/neox_arguments/arguments.py index 42a340f29..a5f47bb7c 100644 --- a/megatron/neox_arguments/arguments.py +++ b/megatron/neox_arguments/arguments.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ import copy import torch import argparse +from pkg_resources import packaging +from importlib.metadata import version from dataclasses import dataclass from typing import List, Dict @@ -896,7 +898,6 @@ def calculate_derived(self): "gradient_accumulation_steps": gradient_accumulation_steps, "batch_size": train_micro_batch_size_per_gpu, # duplicate items - "gas": self.gradient_accumulation_steps, "clip_grad": self.gradient_clipping, } ) @@ -1065,6 +1066,32 @@ def calculate_derived(self): # Can't have a default value as an empty dict so need to set it here self.update_value("sparsity_config", {}) + # Multi-query or grouped-query attention settings + if self.num_kv_heads is not None: + # need KV heads <= query heads, and KV heads dividing query heads evenly + assert ( + self.num_attention_heads % self.num_kv_heads == 0 + ), "num_kv_heads must evenly divide num_attention_heads and be no greater than it" + + if self.num_kv_heads < self.num_attention_heads: + # GQA / MQA not compatible with sparse attention configurations + assert ( + not self.sparsity_config + ), "Sparse attention not compatible with GQA or MQA" + assert all( + (attn_type == "flash") or (attn_type == "global") + for attn_type in self.attention_config + ), "GQA / MQA currently only compatible with Flash or standard global/sliding window Attention" + assert ( + self.num_kv_heads % self.model_parallel_size == 0 + ), "Number of KV heads must be at least model_parallel_size for now!" + # Flash attention version >=2.3.0 required to combine Flash + Sliding Window Attention + if self.sliding_window_width is not None and "flash" in self.attention_config: + _flash_version = packaging.version.Version(version("flash-attn")) + assert _flash_version >= packaging.version.Version( + "2.3.0" + ), f"Flash-Attention version ({str(_flash_version)}) must be >= 2.3.0 to support sliding window attention." + # Adding equal dataset weights if none are provided if self.train_data_paths and (self.train_data_weights is None): self.train_data_weights = [1.0] * len(self.train_data_paths) diff --git a/megatron/neox_arguments/deepspeed_args.py b/megatron/neox_arguments/deepspeed_args.py index 071c3d3b7..708a5f5b1 100644 --- a/megatron/neox_arguments/deepspeed_args.py +++ b/megatron/neox_arguments/deepspeed_args.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/neox_arguments/neox_args.py b/megatron/neox_arguments/neox_args.py index ccc4813d5..26e02b91d 100644 --- a/megatron/neox_arguments/neox_args.py +++ b/megatron/neox_arguments/neox_args.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -110,9 +110,29 @@ class NeoXArgsModel(NeoXArgsTemplate): Transformer hidden size. """ + intermediate_size: int = None + """ + Transformer intermediate size. 
Currently only used for "mlp_type": "llama". + + If not passed, will be set to a reasonable default. + """ + num_attention_heads: int = None """ Number of transformer attention heads. + + If num_kv_heads is set, will control only number of query heads. + """ + + num_kv_heads: int = None + """ + Number of transformer key/value attention heads. + + If set to None or the same value as num_attention_heads, will perform multi-head attention (MHA). + If set to < num_attention_heads but > 1, will perform grouped-query attention (GQA) (https://arxiv.org/pdf/2305.13245.pdf) + If set to 1, will perform multi-query attention. + + Must be < num_attention_heads and divide num_attention_heads evenly. """ seq_length: int = None @@ -120,6 +140,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Maximum sequence length to process. """ + sliding_window_width: int = None + """ + Width of the attention sliding window. Only supported with Flash Attention 2. + """ + max_position_embeddings: int = None """ Maximum number of position embeddings to use. This is the size of position embedding. @@ -130,6 +155,11 @@ class NeoXArgsModel(NeoXArgsTemplate): Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm". """ + layernorm_fusion: bool = False + """ + Use fused layer norm kernel (if `norm` is `layernorm`). + """ + use_qk_layernorm: bool = False """ Use QK Normalization @@ -573,6 +603,39 @@ class NeoXArgsLogging(NeoXArgsTemplate): Whether to offload the buffered gradients to cpu when measuring gradient noise scale. """ + memory_profiling: bool = False + """ + Whether to take a memory snapshot of the model. Useful for debugging memory issues. + """ + + memory_profiling_path: str = None + """ + Path to save memory snapshot to. + """ + + profile: bool = False + """ + Enable nsys profiling. When using this option, + nsys options should be specified in commandline. + An example nsys commandline is + ``` + nsys profile -s none -t nvtx,cuda -o + --force-overwrite true + --capture-range=cudaProfilerApi + --capture-range-end=stop + ``` + """ + + profile_step_start: int = 10 + """ + Step to start profiling at. + """ + + profile_step_stop: int = 12 + """ + Step to stop profiling at. + """ + @dataclass class NeoXArgsOther(NeoXArgsTemplate): @@ -996,9 +1059,6 @@ class NeoXArgsTraining(NeoXArgsTemplate): Partition Activations across GPUs before checkpointing. """ - gas: int = None - """gradient_accumulation_steps""" # TODO this is a duplicate, remove? - clip_grad: float = 1.0 """ Gradient clipping based on global L2 norm. @@ -1158,6 +1218,8 @@ class NeoXArgsTextgen(NeoXArgsTemplate): eval_tasks: list = None """ Tasks to evaluate on using lm_eval_harness + + NOTE: Requires internet connection """ moe_top_k: int = 2 diff --git a/megatron/neox_arguments/template.py b/megatron/neox_arguments/template.py index d0216459f..2e8341990 100644 --- a/megatron/neox_arguments/template.py +++ b/megatron/neox_arguments/template.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/optimizers.py b/megatron/optimizers.py index fcf8a44c7..93515ed14 100644 --- a/megatron/optimizers.py +++ b/megatron/optimizers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
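The `calculate_derived()` additions above enforce the GQA/MQA constraints and check the installed flash-attn version before allowing `sliding_window_width`. That guard is easy to reuse in isolation; the sketch below follows the same `pkg_resources.packaging` plus `importlib.metadata` combination the diff uses, with a `PackageNotFoundError` fallback added so it also runs where flash-attn is not installed.

```python
from importlib.metadata import version, PackageNotFoundError
from pkg_resources import packaging

def flash_attn_supports_sliding_window() -> bool:
    # Sliding window attention relies on the window_size kwarg that first
    # appeared in flash-attn 2.3.0, so anything older must be rejected.
    try:
        installed = packaging.version.Version(version("flash-attn"))
    except PackageNotFoundError:
        return False
    return installed >= packaging.version.Version("2.3.0")

print(flash_attn_supports_sliding_window())
```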
diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py index 5eb982384..7b7a390ab 100644 --- a/megatron/text_generation_utils.py +++ b/megatron/text_generation_utils.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/tokenizer/__init__.py b/megatron/tokenizer/__init__.py index 22b0f7b9e..b53bbb828 100644 --- a/megatron/tokenizer/__init__.py +++ b/megatron/tokenizer/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 504890763..348c7cefe 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/tokenizer/train_tokenizer.py b/megatron/tokenizer/train_tokenizer.py index 72b244a9b..3027677e7 100644 --- a/megatron/tokenizer/train_tokenizer.py +++ b/megatron/tokenizer/train_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/megatron/training.py b/megatron/training.py index 31f8b9c66..6dead1462 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -55,7 +55,9 @@ CharCounter, ) from megatron.model.gpt2_model import cross_entropy -from eval_tasks import run_eval_harness + +from pickle import dump +import os def mup_weights_reinit(neox_args, model): @@ -292,6 +294,7 @@ def _get_batch(neox_args, tokenizer, keys, data, datatype): data=tokens, eod_token=neox_args.tokenizer.eod, eod_mask_loss=neox_args.eod_mask_loss, + sliding_window_width=neox_args.sliding_window_width, ) # If `label` is present, any token < 0 (e.g., -100, the default for torch) skips the loss computation if "label" in data_b: @@ -369,6 +372,8 @@ def forward_step( return model.eval_batch(data_iterator, return_logits=return_logits) # Get the batch. 
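`get_ltor_masks_and_position_ids` now also receives `sliding_window_width`, so the non-flash attention path can mask out tokens beyond the window. The helper below is purely illustrative (it is not the NeoX implementation, and the exact inclusive/exclusive convention may differ): a causal mask in which each position sees itself plus at most `window` previous tokens.

```python
import torch

def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    # True means "may attend": causal (j <= i) and within the left window.
    i = torch.arange(seq_len).unsqueeze(1)
    j = torch.arange(seq_len).unsqueeze(0)
    return (j <= i) & (j >= i - window)

print(sliding_window_causal_mask(6, 2).int())
# tensor([[1, 0, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 0, 0, 0],
#         [0, 1, 1, 1, 0, 0],
#         [0, 0, 1, 1, 1, 0],
#         [0, 0, 0, 1, 1, 1]], dtype=torch.int32)
```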
+ if neox_args.memory_profiling and neox_args.it: + torch.cuda.nvtx.range_push(f"Get batch") if timers is not None: timers("batch generator").start() tokens, labels, loss_mask, attention_mask, position_ids = get_batch( @@ -377,7 +382,11 @@ def forward_step( if timers is not None: timers("batch generator").stop() + if neox_args.memory_profiling: + torch.cuda.nvtx.range_pop() + if neox_args.memory_profiling: + torch.cuda.nvtx.range_push(f"Forward pass") # Sequential returns moe_losses, but this is not yet supported by pipe parallel maybe_tuple = model((tokens, position_ids, attention_mask), neox_args=neox_args) if type(maybe_tuple) is tuple: @@ -395,6 +404,8 @@ def forward_step( main_loss = cross_entropy( outputs, (labels, loss_mask), _fp16=neox_args.fp16_lm_cross_entropy ) + if neox_args.memory_profiling: + torch.cuda.nvtx.range_pop() moe_loss = neox_args.moe_loss_coeff * sum(m.item() for m in moe_losses) loss = main_loss + moe_loss if return_logits: @@ -647,6 +658,15 @@ def get_learning_rate_scheduler(optimizer, neox_args): def setup_model_and_optimizer(neox_args, use_cache=False, iteration=None): + """Setup memory profiler""" + if neox_args.memory_profiling: + torch.cuda.memory._record_memory_history( + True, + # keep a maximum 100,000 alloc/free events from before the snapshot + trace_alloc_max_entries=100000, + trace_alloc_record_context=True, + ) + """Setup model and optimizer.""" model = get_model(neox_args=neox_args, use_cache=use_cache) optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args) @@ -746,6 +766,13 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) reduced_loss = train_step_pipe( neox_args=neox_args, timers=timers, model=model, data_iterator=data_iterator ) + if ( + neox_args.memory_profiling + and neox_args.iteration >= neox_args.profile_step_start + and neox_args.iteration <= neox_args.profile_step_stop + and torch.distributed.get_rank() == 0 + ): + save_snapshot(neox_args) else: losses = [] for _ in range(neox_args.gradient_accumulation_steps): @@ -761,6 +788,12 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) timers("forward").stop() losses.append(loss) # Calculate gradients, reduce across processes, and clip. + if ( + neox_args.profile + and neox_args.iteration >= neox_args.profile_step_start + and neox_args.iteration <= neox_args.profile_step_stop + ): + torch.cuda.nvtx.range_push(f"Backward pass") timers("backward").start() backward_step( neox_args=neox_args, @@ -770,13 +803,38 @@ def train_step(neox_args, timers, data_iterator, model, optimizer, lr_scheduler) loss=loss, ) timers("backward").stop() + if ( + neox_args.profile + and neox_args.iteration >= neox_args.profile_step_start + and neox_args.iteration <= neox_args.profile_step_stop + ): + torch.cuda.nvtx.range_pop() # Update parameters. 
+ if ( + neox_args.profile + and neox_args.iteration >= neox_args.profile_step_start + and neox_args.iteration <= neox_args.profile_step_stop + ): + torch.cuda.nvtx.range_push(f"Optimizer step") timers("optimizer").start() if neox_args.deepspeed: model.step() else: raise ValueError("Must be using deepspeed to run neox") timers("optimizer").stop() + if ( + neox_args.profile + and neox_args.iteration >= neox_args.profile_step_start + and neox_args.iteration <= neox_args.profile_step_stop + ): + torch.cuda.nvtx.range_pop() + if ( + neox_args.profile + and neox_args.iteration >= neox_args.profile_step_start + and neox_args.iteration <= neox_args.profile_step_stop + and torch.distributed.get_rank() == 0 + ): + save_snapshot(neox_args) reduced_loss = { "lm_loss": reduce_losses(losses).mean() } # reduces losses across machines for logging @@ -838,6 +896,8 @@ def train( # to monitor if we've skipped many iterations in a row and trigger an early exit overflow_monitor = OverflowMonitor(optimizer) while iteration < neox_args.train_iters: + if neox_args.profile and iteration == neox_args.profile_step_start: + torch.cuda.cudart().cudaProfilerStart() loss_dict, skipped_iter = train_step( neox_args=neox_args, timers=timers, @@ -846,6 +906,8 @@ def train( optimizer=optimizer, lr_scheduler=lr_scheduler, ) + if neox_args.profile and iteration == neox_args.profile_step_stop: + torch.cuda.cudart().cudaProfilerStop() iteration += 1 neox_args.iteration = iteration if neox_args.precision == "fp16": @@ -946,7 +1008,7 @@ def evaluate( # although we're not accumulating gradients here, we count one iter as train_batch_size_per_gpu * g.a.s # to be consistent with deepspeed's pipe parallel engine - # since pipe parallel already takes gas into account - default to 1 here if pipe parallel is true + # since pipe parallel already takes gradient_accumulation_steps into account - default to 1 here if pipe parallel is true for _ in range( 1 if neox_args.is_pipe_parallel @@ -985,6 +1047,8 @@ def evaluate( ) if neox_args.eval_tasks: + from eval_tasks import run_eval_harness + eval_results.update( run_eval_harness( model, forward_step_fn, neox_args, eval_tasks=neox_args.eval_tasks @@ -1050,3 +1114,15 @@ def evaluate_and_print_results( print_rank_0("-" * length) print_rank_0(string) print_rank_0("-" * length) + + +def save_snapshot(neox_args): + assert ( + neox_args.memory_profiling_path is not None + ), "Must pass memory_profiling_path config arg to use profiling" + snapshot = torch.cuda.memory._snapshot() + snapshot_path = os.path.join(neox_args.memory_profiling_path) + if not os.path.exists(snapshot_path): + os.makedirs(snapshot_path) + with open(os.path.join(snapshot_path, "mem_snapshot.pickle"), "wb") as f: + dump(snapshot, f) diff --git a/megatron/utils.py b/megatron/utils.py index 3769495d9..26b4439bd 100644 --- a/megatron/utils.py +++ b/megatron/utils.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
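The `save_snapshot` helper added above pickles the output of `torch.cuda.memory._snapshot()` to `mem_snapshot.pickle`; the usual way to inspect it is to load the file into https://pytorch.org/memory_viz. For a quick command-line look, a hedged sketch (this helper is hypothetical and assumes the private snapshot layout of recent PyTorch releases, which may change between versions):

```python
import pickle
from collections import defaultdict

def summarize_memory_snapshot(path="mem_snapshot.pickle"):
    """Print cached-segment totals per device from a torch.cuda memory snapshot pickle."""
    with open(path, "rb") as f:
        snapshot = pickle.load(f)
    per_device = defaultdict(int)
    for segment in snapshot["segments"]:  # each segment records its device and total_size
        per_device[segment["device"]] += segment["total_size"]
    for device, total in sorted(per_device.items()):
        print(f"cuda:{device}: {total / 2**30:.2f} GiB in cached segments")
```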
@@ -64,7 +64,7 @@ def report_memory(name): print_rank_0(string) -def get_attn_mask(seq_length, device): +def get_attn_mask(seq_length, device, sliding_window_width): """ Get triangular attention mask for a given sequence length / device. """ @@ -72,6 +72,9 @@ def get_attn_mask(seq_length, device): mask = torch.tril(torch.ones((1, seq_length, seq_length), device=device)).view( 1, 1, seq_length, seq_length ) + # get rid of lower diagonals than the sliding window width, if a value was provided + if sliding_window_width is not None: + mask = torch.triu(mask, diagonal=-sliding_window_width) # convert to binary return mask < 0.5 @@ -81,6 +84,7 @@ def get_ltor_masks_and_position_ids( data, eod_token, eod_mask_loss=False, + sliding_window_width=None, ): """Build masks and position id for left to right model.""" @@ -91,6 +95,7 @@ def get_ltor_masks_and_position_ids( attention_mask = get_attn_mask( seq_length=seq_length, device=data.device, + sliding_window_width=sliding_window_width, ) # Loss mask. @@ -484,3 +489,62 @@ def __next__(self): end = time.time() self.total_time += end - start return batch + + +def _kernel_make_viewless_tensor(inp, requires_grad): + """Make a viewless tensor. + + View tensors have the undesirable side-affect of retaining a reference + to the originally-viewed tensor, even after manually setting the '.data' + field. This method creates a new tensor that links to the old tensor's + data, without linking the viewed tensor, referenced via the '._base' + field. + """ + out = torch.empty( + (1,), + dtype=inp.dtype, + device=inp.device, + requires_grad=requires_grad, + ) + out.data = inp.data + return out + + +class MakeViewlessTensor(torch.autograd.Function): + """ + Autograd function to make a viewless tensor. + + This function should be used in cases where the computation graph needs + to be propagated, but we only want a viewless tensor (e.g., + ParallelTransformer's hidden_states). Call this function by passing + 'keep_graph = True' to 'make_viewless_tensor()'. + """ + + @staticmethod + def forward(ctx, inp, requires_grad): + return _kernel_make_viewless_tensor(inp, requires_grad) + + @staticmethod + def backward(ctx, grad_output): + return grad_output, None + + +def make_viewless_tensor(inp, requires_grad, keep_graph): + """ + Entry-point for creating viewless tensors. + + This method should be used, rather than calling 'MakeViewlessTensor' + or '_kernel_make_viewless_tensor' directly. This method acts as a + switch for determining if an autograd function or a regular method + should be used to create the tensor. + """ + + # return tensor as-is, if not a 'view' + if inp._base is None: + return inp + + # create viewless tensor + if keep_graph: + return MakeViewlessTensor.apply(inp, requires_grad) + else: + return _kernel_make_viewless_tensor(inp, requires_grad) diff --git a/prepare_data.py b/prepare_data.py index 86d8b5f5a..62363e27b 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
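The `sliding_window_width` handling above is easiest to read in isolation: start from the causal (lower-triangular) mask, then drop positions further back than the window before converting to the boolean "masked" form. A standalone restatement of exactly that logic (illustrative helper, not an additional change in this diff):

```python
import torch

def sliding_window_causal_mask(seq_length, sliding_window_width=None, device="cpu"):
    """Returns a [1, 1, seq, seq] bool mask, True where attention is NOT allowed."""
    mask = torch.tril(torch.ones((1, seq_length, seq_length), device=device)).view(
        1, 1, seq_length, seq_length
    )
    if sliding_window_width is not None:
        # keep only the last `sliding_window_width` positions below the diagonal
        mask = torch.triu(mask, diagonal=-sliding_window_width)
    return mask < 0.5
```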
diff --git a/requirements/requirements-apex-pip.txt b/requirements/requirements-apex-pip.txt new file mode 100644 index 000000000..981f54a30 --- /dev/null +++ b/requirements/requirements-apex-pip.txt @@ -0,0 +1 @@ +pip==23.3.2 diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 6c58478f3..60ff3224f 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -4,4 +4,5 @@ pre-commit>=2.17.0 pytest>=6.2.3 pytest-cov>=2.11.1 pytest-forked>=1.3.0 +pytest-html==4.1.1 pytest-xdist diff --git a/requirements/requirements-sparseattention.txt b/requirements/requirements-sparseattention.txt index 09386fdcb..3b382f83f 100644 --- a/requirements/requirements-sparseattention.txt +++ b/requirements/requirements-sparseattention.txt @@ -1 +1 @@ -triton==2.0.0.dev20221202 +triton==2.1.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 4c9a3ca64..6247ece3d 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,7 +4,7 @@ ftfy>=6.0.1 git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836 huggingface_hub>=0.11.0 jinja2==3.1.3 -git+https://github.com/EleutherAI/lm-evaluation-harness.git@main#egg=lm_eval +lm_eval>=0.4.0,<=0.4.1 mpi4py>=3.0.3 numpy>=1.22.0 pybind11>=2.6.2 diff --git a/tests/common.py b/tests/common.py index d2659b453..c63ced0f7 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,15 +17,20 @@ import shutil import itertools from pathlib import Path +from abc import ABC, abstractmethod +from deepspeed.accelerator import get_accelerator import pytest +from _pytest.outcomes import Skipped +from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker import random import train import torch + import torch.distributed as dist from torch.multiprocessing import Process -import multiprocessing as mp +import torch.multiprocessing as mp from yaml import load try: @@ -41,6 +46,7 @@ # Worker timeout *after* the first worker has completed. 
DEEPSPEED_UNIT_WORKER_TIMEOUT = 120 +DEEPSPEED_TEST_TIMEOUT = 600 def get_xdist_worker_id(): @@ -62,6 +68,58 @@ def get_master_port(): _num_gpus = None +def set_accelerator_visible(): + cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None) + xdist_worker_id = get_xdist_worker_id() + if xdist_worker_id is None: + xdist_worker_id = 0 + if cuda_visible is None: + # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead + if get_accelerator().device_name() == "cuda": + if is_rocm_pytorch(): + rocm_smi = subprocess.check_output(["rocm-smi", "--showid"]) + gpu_ids = filter( + lambda s: "GPU" in s, rocm_smi.decode("utf-8").strip().split("\n") + ) + num_accelerators = len(list(gpu_ids)) + else: + nvidia_smi = subprocess.check_output(["nvidia-smi", "--list-gpus"]) + num_accelerators = len(nvidia_smi.decode("utf-8").strip().split("\n")) + elif get_accelerator().device_name() == "xpu": + clinfo = subprocess.check_output(["clinfo"]) + lines = clinfo.decode("utf-8").strip().split("\n") + num_accelerators = 0 + for line in lines: + match = re.search("Device Type.*GPU", line) + if match: + num_accelerators += 1 + elif get_accelerator().device_name() == "npu": + npu_smi = subprocess.check_output(["npu-smi", "info", "-l"]) + num_accelerators = int( + npu_smi.decode("utf-8").strip().split("\n")[0].split(":")[1].strip() + ) + else: + assert get_accelerator().device_name() == "cpu" + cpu_sockets = int( + subprocess.check_output( + 'cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l', + shell=True, + ) + ) + num_accelerators = cpu_sockets + + cuda_visible = ",".join(map(str, range(num_accelerators))) + + # rotate list based on xdist worker id, example below + # wid=0 -> ['0', '1', '2', '3'] + # wid=1 -> ['1', '2', '3', '0'] + # wid=2 -> ['2', '3', '0', '1'] + # wid=3 -> ['3', '0', '1', '2'] + dev_id_list = cuda_visible.split(",") + dev_id_list = dev_id_list[xdist_worker_id:] + dev_id_list[:xdist_worker_id] + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(dev_id_list) + + def count_gpus(): global _num_gpus if _num_gpus is None: @@ -121,117 +179,298 @@ def clear_test_dirs(): shutil.rmtree(tensorboard_dir) -def distributed_test(world_size=2, backend="nccl"): - """A decorator for executing a function (e.g., a unit test) in a distributed manner. - This decorator manages the spawning and joining of processes, initialization of - torch.distributed, and catching of errors. +class DistributedExec(ABC): + """ + Base class for distributed execution of functions/methods. Contains common + methods needed for DistributedTest and DistributedFixture. + """ - This function is copied from: https://github.com/EleutherAI/DeeperSpeed/blob/24026e5bb37c528a222b8635c46256b1e1825d2e/tests/unit/common.py#L16 + world_size = 2 + backend = get_accelerator().communication_backend_name() + init_distributed = True + set_dist_env = True + requires_cuda_env = True + reuse_dist_env = False + _pool_cache = {} + exec_timeout = DEEPSPEED_TEST_TIMEOUT + + @abstractmethod + def run(self): + ... 
+ + def __call__(self, request=None): + self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) + world_size = self.world_size + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs) + + def _get_fixture_kwargs(self, request, func): + if not request: + return {} + # Grab fixture / parametrize kwargs from pytest request object + fixture_kwargs = {} + params = inspect.getfullargspec(func).args + params.remove("self") + for p in params: + try: + fixture_kwargs[p] = request.getfixturevalue(p) + except FixtureLookupError: + pass # test methods can have kwargs that are not fixtures + return fixture_kwargs + + def _launch_procs(self, num_procs): + # Verify we have enough accelerator devices to run this test + if ( + get_accelerator().is_available() + and get_accelerator().device_count() < num_procs + ): + pytest.skip( + f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" + ) + + mp.set_start_method("spawn", force=True) + + # Create process pool or use cached one + master_port = None + if self.reuse_dist_env: + if num_procs not in self._pool_cache: + self._pool_cache[num_procs] = mp.Pool(processes=num_procs) + master_port = get_master_port() + pool = self._pool_cache[num_procs] + else: + pool = mp.Pool(processes=num_procs) + master_port = get_master_port() + + # Run the test + args = [(local_rank, num_procs, master_port) for local_rank in range(num_procs)] + skip_msgs_async = pool.starmap_async(self._dist_run, args) + + try: + skip_msgs = skip_msgs_async.get(self.exec_timeout) + except mp.TimeoutError: + # Shortcut to exit pytest in the case of a hanged test. 
This + # usually means an environment error and the rest of tests will + # hang (causing super long unit test runtimes) + pytest.exit("Test hanged, exiting", returncode=0) + + # Tear down distributed environment and close process pools + self._close_pool(pool, num_procs) + + # If we skipped a test, propagate that to this process + if any(skip_msgs): + assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" + pytest.skip(skip_msgs[0]) + + def _dist_run(self, local_rank, num_procs, master_port): + skip_msg = "" + if not dist.is_initialized(): + """Initialize deepspeed.comm and execute the user function.""" + if self.set_dist_env: + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + os.environ["LOCAL_RANK"] = str(local_rank) + # NOTE: unit tests don't support multi-node so local_rank == global rank + os.environ["RANK"] = str(local_rank) + # In case of multiprocess launching LOCAL_SIZE should be same as WORLD_SIZE + # DeepSpeed single node launcher would also set LOCAL_SIZE accordingly + os.environ["LOCAL_SIZE"] = str(num_procs) + os.environ["WORLD_SIZE"] = str(num_procs) + + # turn off NCCL logging if set + os.environ.pop("NCCL_DEBUG", None) + + if get_accelerator().is_available(): + set_accelerator_visible() + + if get_accelerator().is_available(): + get_accelerator().set_device(local_rank) + + if self.init_distributed: + deepspeed.init_distributed(dist_backend=self.backend) + dist.barrier() + + try: + self.run(**self._fixture_kwargs) + except BaseException as e: + if isinstance(e, Skipped): + skip_msg = e.msg + else: + raise e + + return skip_msg + + def _dist_destroy(self): + if (dist is not None) and dist.is_initialized(): + dist.barrier() + dist.destroy_process_group() + + def _close_pool(self, pool, num_procs, force=False): + if force or not self.reuse_dist_env: + msg = pool.starmap(self._dist_destroy, [() for _ in range(num_procs)]) + pool.close() + pool.join() - Usage example: - @distributed_test(worker_size=[2,3]) - def my_test(): - rank = dist.get_rank() - world_size = dist.get_world_size() - assert(rank < world_size) - Arguments: - world_size (int or list): number of ranks to spawn. Can be a list to spawn - multiple tests. +class DistributedFixture(DistributedExec): + """ + Implementation that extends @pytest.fixture to allow for distributed execution. + This is primarily meant to be used when a test requires executing two pieces of + code with different world sizes. + + There are 2 parameters that can be modified: + - world_size: int = 2 -- the number of processes to launch + - backend: Literal['nccl','mpi','gloo'] = 'nccl' -- which backend to use + + Features: + - able to call pytest.skip() inside fixture + - can be reused by multiple tests + - can accept other fixtures as input + + Limitations: + - cannot use @pytest.mark.parametrize + - world_size cannot be modified after definition and only one world_size value is accepted + - any fixtures used must also be used in the test that uses this fixture (see example below) + - return values cannot be returned. Passing values to a DistributedTest + object can be achieved using class_tmpdir and writing to file (see example below) + + Usage: + - must implement a run(self, ...) 
method + - fixture can be used by making the class name input to a test function + + Example: + @pytest.fixture(params=[10,20]) + def regular_pytest_fixture(request): + return request.param + + class distributed_fixture_example(DistributedFixture): + world_size = 4 + + def run(self, regular_pytest_fixture, class_tmpdir): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + local_rank = os.environ["LOCAL_RANK"] + print(f"Rank {local_rank} with value {regular_pytest_fixture}") + with open(os.path.join(class_tmpdir, f"{local_rank}.txt"), "w") as f: + f.write(f"{local_rank},{regular_pytest_fixture}") + + class TestExample(DistributedTest): + world_size = 1 + + def test(self, distributed_fixture_example, regular_pytest_fixture, class_tmpdir): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + for rank in range(4): + with open(os.path.join(class_tmpdir, f"{rank}.txt"), "r") as f: + assert f.read() == f"{rank},{regular_pytest_fixture}" """ - def dist_wrap(run_func): - """Second-level decorator for dist_test. This actually wraps the function.""" + is_dist_fixture = True - def dist_init(local_rank, num_procs, *func_args, **func_kwargs): - """Initialize torch.distributed and execute the user function.""" - os.environ["MASTER_ADDR"] = "127.0.0.1" - os.environ["MASTER_PORT"] = get_master_port() - os.environ["LOCAL_RANK"] = str(local_rank) - # NOTE: unit tests don't support multi-node so local_rank == global rank - os.environ["RANK"] = str(local_rank) - os.environ["WORLD_SIZE"] = str(num_procs) + # These values are just placeholders so that pytest recognizes this as a fixture + _pytestfixturefunction = FixtureFunctionMarker(scope="function", params=None) + __name__ = "" - # turn off NCCL logging if set - os.environ.pop("NCCL_DEBUG", None) + def __init__(self): + assert isinstance( + self.world_size, int + ), "Only one world size is allowed for distributed fixtures" + self.__name__ = type(self).__name__ + _pytestfixturefunction = FixtureFunctionMarker( + scope="function", params=None, name=self.__name__ + ) - deepspeed.init_distributed(dist_backend=backend) - if torch.cuda.is_available(): - torch.cuda.set_device(local_rank) +class DistributedTest(DistributedExec): + """ + Implementation for running pytest with distributed execution. + + There are 2 parameters that can be modified: + - world_size: Union[int,List[int]] = 2 -- the number of processes to launch + - backend: Literal['nccl','mpi','gloo'] = 'nccl' -- which backend to use + + Features: + - able to call pytest.skip() inside tests + - works with pytest fixtures, parametrize, mark, etc. + - can contain multiple tests (each of which can be parametrized separately) + - class methods can be fixtures (usable by tests in this class only) + - world_size can be changed for individual tests using @pytest.mark.world_size(world_size) + - class_tmpdir is a fixture that can be used to get a tmpdir shared among + all tests (including DistributedFixture) + + Usage: + - class name must start with "Test" + - must implement one or more test*(self, ...) 
methods + + Example: + @pytest.fixture(params=[10,20]) + def val1(request): + return request.param + + @pytest.mark.fast + @pytest.mark.parametrize("val2", [30,40]) + class TestExample(DistributedTest): + world_size = 2 + + @pytest.fixture(params=[50,60]) + def val3(self, request): + return request.param + + def test_1(self, val1, val2, str1="hello world"): + assert int(os.environ["WORLD_SIZE"]) == self.world_size + assert all(val1, val2, str1) + + @pytest.mark.world_size(1) + @pytest.mark.parametrize("val4", [70,80]) + def test_2(self, val1, val2, val3, val4): + assert int(os.environ["WORLD_SIZE"]) == 1 + assert all(val1, val2, val3, val4) + """ - run_func(*func_args, **func_kwargs) + def __init__(self): + self.is_dist_test = True - # make sure all ranks finish at the same time - torch.distributed.barrier() - # tear down after test completes - torch.distributed.destroy_process_group() + # Temporary directory that is shared among test methods in a class + @pytest.fixture(autouse=True, scope="class") + def class_tmpdir(self, tmpdir_factory): + fn = tmpdir_factory.mktemp(self.__class__.__name__) + return fn - def dist_launcher(num_procs, *func_args, **func_kwargs): - """Launch processes and gracefully handle failures.""" + def run(self, **fixture_kwargs): + self._current_test(**fixture_kwargs) - # Spawn all workers on subprocesses. - processes = [] - for local_rank in range(num_procs): - p = Process( - target=dist_init, - args=(local_rank, num_procs, *func_args), - kwargs=func_kwargs, - ) - p.start() - processes.append(p) - - # Now loop and wait for a test to complete. The spin-wait here isn't a big - # deal because the number of processes will be O(#GPUs) << O(#CPUs). - any_done = False - while not any_done: - for p in processes: - if not p.is_alive(): - any_done = True - break - - # Wait for all other processes to complete - for p in processes: - p.join(DEEPSPEED_UNIT_WORKER_TIMEOUT) - - failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0] - for rank, p in failed: - # If it still hasn't terminated, kill it because it hung. 
- if p.exitcode is None: - p.terminate() - pytest.fail(f"Worker {rank} hung.", pytrace=False) - if p.exitcode < 0: - pytest.fail( - f"Worker {rank} killed by signal {-p.exitcode}", pytrace=False - ) - if p.exitcode > 0: - pytest.fail( - f"Worker {rank} exited with code {p.exitcode}", pytrace=False - ) - - def run_func_decorator(*func_args, **func_kwargs): - """Entry point for @distributed_test().""" - - gpus = count_gpus() - - if isinstance(world_size, int): - if gpus < world_size: - pytest.mark.skip( - reason=f"at least {world_size} GPUs are required to run this test" - ) - return - - dist_launcher(world_size, *func_args, **func_kwargs) - elif isinstance(world_size, list): - for procs in world_size: - dist_launcher(procs, *func_args, **func_kwargs) - time.sleep(0.5) - else: - raise TypeError(f"world_size must be an integer or a list of integers.") + def __call__(self, request): + self._current_test = self._get_current_test_func(request) + self._fixture_kwargs = self._get_fixture_kwargs(request, self._current_test) + + if self.requires_cuda_env and not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.") + + # Catch world_size override pytest mark + for mark in getattr(request.function, "pytestmark", []): + if mark.name == "world_size": + world_size = mark.args[0] + break + else: + world_size = self.world_size + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + self._launch_procs(procs) + time.sleep(0.5) + + def _get_current_test_func(self, request): + # DistributedTest subclasses may have multiple test methods + func_name = request.function.__name__ + return getattr(self, func_name) - return run_func_decorator - return dist_wrap +def get_test_path(filename): + curr_path = Path(__file__).parent + return str(curr_path.joinpath(filename)) def model_setup(yaml_list=None, param_dict=None, clear_data=True): @@ -388,3 +627,4 @@ def dict_repr(d): with open("tests/config/test_setup.yml", "r") as f: BASE_CONFIG = load(f, Loader=Loader) + print(f"Base Config:\n{BASE_CONFIG}") diff --git a/tests/config/test_setup.yml b/tests/config/test_setup.yml index c4de20940..882bf7e63 100644 --- a/tests/config/test_setup.yml +++ b/tests/config/test_setup.yml @@ -28,6 +28,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # Optimizer "optimizer": { @@ -43,7 +44,7 @@ "output_layer_init_method": "wang_init", "train_micro_batch_size_per_gpu": 4, - "gas": 1, + "gradient_accumulation_steps": 1, "data_impl": "mmap", "num_workers": 1, diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..917dd8543 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# tests directory-specific settings - this file is run automatically by pytest before any tests are run + +import sys +import pytest +import os +from os.path import abspath, dirname, join +import torch +import warnings + +# Set this environment variable for the T5 inference unittest(s) (e.g. google/t5-v1_1-small) +os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" + +# allow having multiple repository checkouts and not needing to remember to rerun +# 'pip install -e .[dev]' when switching between checkouts and running tests. 
+git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) +sys.path.insert(1, git_repo_path) + + +def pytest_configure(config): + # config.option.color = "yes" + # config.option.durations = 0 + # config.option.durations_min = 1 + config.option.verbose = True + + +def pytest_addoption(parser): + parser.addoption("--torch_ver", default=None, type=str) + parser.addoption("--cuda_ver", default=None, type=str) + + +def validate_version(expected, found): + version_depth = expected.count(".") + 1 + found = ".".join(found.split(".")[:version_depth]) + return found == expected + + +@pytest.fixture(scope="session", autouse=True) +def check_environment(pytestconfig): + expected_torch_version = pytestconfig.getoption("torch_ver") + expected_cuda_version = pytestconfig.getoption("cuda_ver") + if expected_torch_version is None: + warnings.warn( + "Running test without verifying torch version, please provide an expected torch version with --torch_ver" + ) + elif not validate_version(expected_torch_version, torch.__version__): + pytest.exit( + f"expected torch version {expected_torch_version} did not match found torch version {torch.__version__}", + returncode=2, + ) + if expected_cuda_version is None: + warnings.warn( + "Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver" + ) + elif not validate_version(expected_cuda_version, torch.version.cuda): + pytest.exit( + f"expected cuda version {expected_cuda_version} did not match found cuda version {torch.version.cuda}", + returncode=2, + ) + + +# Override of pytest "runtest" for DistributedTest class +# This hook is run before the default pytest_runtest_call +@pytest.hookimpl(tryfirst=True) +def pytest_runtest_call(item): + # We want to use our own launching function for distributed tests + if getattr(item.cls, "is_dist_test", False): + dist_test_class = item.cls() + dist_test_class(item._request) + item.runtest = lambda: True # Dummy function so test is not run twice + + +# We allow DistributedTest to reuse distributed environments. When the last +# test for a class is run, we want to make sure those distributed environments +# are destroyed. +def pytest_runtest_teardown(item, nextitem): + if getattr(item.cls, "reuse_dist_env", False) and not nextitem: + dist_test_class = item.cls() + for num_procs, pool in dist_test_class._pool_cache.items(): + dist_test_class._close_pool(pool, num_procs, force=True) + + +@pytest.hookimpl(tryfirst=True) +def pytest_fixture_setup(fixturedef, request): + if getattr(fixturedef.func, "is_dist_fixture", False): + dist_fixture_class = fixturedef.func() + dist_fixture_class(request) diff --git a/tests/model/__init__.py b/tests/model/__init__.py index b9c364917..d38c7d4d0 100644 --- a/tests/model/__init__.py +++ b/tests/model/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from .test_model_instantiation import run_test_model_instantiation -from .test_model_train import run_train_test -from .test_model_checkpoint import run_checkpoint_test diff --git a/tests/model/test_fused_kernels.py b/tests/model/test_fused_kernels.py index be068099e..b8cb34d1b 100644 --- a/tests/model/test_fused_kernels.py +++ b/tests/model/test_fused_kernels.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/model/test_model_checkpoint.py b/tests/model/test_model_checkpoint.py index e73e1a0fe..96f51683b 100644 --- a/tests/model/test_model_checkpoint.py +++ b/tests/model/test_model_checkpoint.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ import pytest from tests.common import ( - distributed_test, + DistributedTest, clear_test_dirs, model_setup, binary, @@ -73,60 +73,58 @@ def test_train(param_dict): d = tempfile.mkdtemp() param_dict["save"] = d - @distributed_test(world_size=2) - def wrapper(): - run_checkpoint_test(param_dict=param_dict) + t1 = test_run_checkpoint_test_class() + t1.run_checkpoint_test(param_dict=param_dict) - wrapper() +class test_run_checkpoint_test_class(DistributedTest): + def run_checkpoint_test(yaml_list=None, param_dict=None): -def run_checkpoint_test(yaml_list=None, param_dict=None): + from megatron.checkpointing import load_checkpoint + from megatron.checkpointing import save_checkpoint - from megatron.checkpointing import load_checkpoint - from megatron.checkpointing import save_checkpoint - - model, optimizer, lr_scheduler, args_loaded = model_setup( - yaml_list, param_dict, clear_data=True - ) - - # save model checkpoint - save_checkpoint( - neox_args=args_loaded, - iteration=42, - model=model, - optimizer=optimizer, - lr_scheduler=lr_scheduler, - ) - - # reload model from checkpoint - ( - reloaded_model, - reloaded_optimizer, - reloaded_lr_scheduler, - args_reloaded, - ) = model_setup(yaml_list, param_dict, clear_data=False) - iteration = load_checkpoint( - neox_args=args_reloaded, - model=reloaded_model, - optimizer=reloaded_optimizer, - lr_scheduler=reloaded_lr_scheduler, - ) + model, optimizer, lr_scheduler, args_loaded = model_setup( + yaml_list, param_dict, clear_data=True + ) - # ensure same checkpoint is loaded - assert ( - iteration == 42 - ), "run_checkpoint_test() iteration loaded from checkpoint correct" + # save model checkpoint + save_checkpoint( + neox_args=args_loaded, + iteration=42, + model=model, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) - # check all weight groups are the same - for idx, ((n1, p1), (n2, p2)) in enumerate( - zip( - list(model.module.named_parameters()), - list(reloaded_model.module.named_parameters()), + # reload model from checkpoint + ( + reloaded_model, + reloaded_optimizer, + reloaded_lr_scheduler, + args_reloaded, + ) = model_setup(yaml_list, param_dict, clear_data=False) + iteration = load_checkpoint( + neox_args=args_reloaded, + model=reloaded_model, + optimizer=reloaded_optimizer, + lr_scheduler=reloaded_lr_scheduler, ) - ): - assert n1 == n2 - params_equal = (p1 == p2).all().item() - assert params_equal, "run_checkpoint_test() params equal: " + str(n1) + + # ensure same checkpoint is loaded + assert ( + iteration == 42 + 
), "run_checkpoint_test() iteration loaded from checkpoint correct" + + # check all weight groups are the same + for idx, ((n1, p1), (n2, p2)) in enumerate( + zip( + list(model.module.named_parameters()), + list(reloaded_model.module.named_parameters()), + ) + ): + assert n1 == n2 + params_equal = (p1 == p2).all().item() + assert params_equal, "run_checkpoint_test() params equal: " + str(n1) if __name__ == "__main__": diff --git a/tests/model/test_model_generation.py b/tests/model/test_model_generation.py index ab8bd756b..6dd93f355 100644 --- a/tests/model/test_model_generation.py +++ b/tests/model/test_model_generation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ import os import pytest -from tests.common import distributed_test, model_setup, parametrize +from tests.common import DistributedTest, model_setup, parametrize PARAMS_TO_TEST = { "pipe_parallel_size,model_parallel_size,world_size": [ @@ -67,47 +67,47 @@ @pytest.mark.skip @pytest.mark.parametrize("param_dict", parameters, ids=names) def test_train(param_dict): - @distributed_test(world_size=param_dict.pop("world_size", 2)) - def wrapper(): - run_generate_test(param_dict=param_dict, prompt=param_dict.pop("prompt")) + t1 = run_generate_test_class() + t1.run_generate_test(param_dict, param_dict.pop("prompt")) - wrapper() +class run_generate_test_class(DistributedTest): + world_size = 2 -def run_generate_test(param_dict, prompt): - from megatron.text_generation_utils import generate_samples_from_prompt - from megatron.utils import is_mp_rank_0 + def run_generate_test(param_dict, prompt): + from megatron.text_generation_utils import generate_samples_from_prompt + from megatron.utils import is_mp_rank_0 - fixed_params = { - "num_samples": 3, - "maximum_tokens": 50, - "make_vocab_size_divisible_by": 2, - "sample_output_file": "test_sample_output.txt", - "checkpoint_activations": False, - "partition_activations": False, - "no_load_optim": True, - } + fixed_params = { + "num_samples": 3, + "maximum_tokens": 50, + "make_vocab_size_divisible_by": 2, + "sample_output_file": "test_sample_output.txt", + "checkpoint_activations": False, + "partition_activations": False, + "no_load_optim": True, + } - param_dict.update(fixed_params) - # TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this - model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True) - model.eval() + param_dict.update(fixed_params) + # TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this + model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True) + model.eval() - prompts = [prompt for _ in range(args_loaded.num_samples)] - output = generate_samples_from_prompt( - neox_args=args_loaded, - model=model, - text=prompts, - maximum_tokens=args_loaded.maximum_tokens, - recompute=False, - temperature=args_loaded.temperature, - top_k=args_loaded.top_k, - top_p=args_loaded.top_p, - ) + prompts = [prompt for _ in range(args_loaded.num_samples)] + output = generate_samples_from_prompt( + neox_args=args_loaded, + model=model, + text=prompts, + maximum_tokens=args_loaded.maximum_tokens, + recompute=False, + temperature=args_loaded.temperature, + top_k=args_loaded.top_k, + top_p=args_loaded.top_p, + ) - # outputs 
only get generated on mp rank 0 - if is_mp_rank_0(): - assert len(output) == len(prompts) - for prompt, out in zip(prompts, output): - assert prompt == out["context"] - assert len(out["text"]) > 0 + # outputs only get generated on mp rank 0 + if is_mp_rank_0(): + assert len(output) == len(prompts) + for prompt, out in zip(prompts, output): + assert prompt == out["context"] + assert len(out["text"]) > 0 diff --git a/tests/model/test_model_instantiation.py b/tests/model/test_model_instantiation.py index ad57a2c5f..81c5cae4c 100644 --- a/tests/model/test_model_instantiation.py +++ b/tests/model/test_model_instantiation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ import torch import os from tests.common import ( - distributed_test, + DistributedTest, model_setup, clear_test_dirs, parametrize, @@ -80,11 +80,8 @@ ) @pytest.mark.parametrize("param_dict", parameters, ids=names) def test_instantiate(param_dict): - @distributed_test(world_size=param_dict.pop("world_size", 2)) - def wrapper(): - run_test_model_instantiation(param_dict=param_dict) - - wrapper() + t1 = test_instantiate_optimizers_class() + t1.run_test_model_instantiation(param_dict) OPTIMIZER_PARAMS = { @@ -108,24 +105,24 @@ def wrapper(): ) @pytest.mark.parametrize("param_dict", opt_params, ids=opt_name) def test_instantiate_optimizers(param_dict): - @distributed_test(world_size=2) - def wrapper(): - run_test_model_instantiation(param_dict=param_dict) - - wrapper() - - -def run_test_model_instantiation(yaml_list=None, param_dict=None): - from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine - - model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict) - if args_loaded.pipe_parallel_size < 2: - assert isinstance(model, DeepSpeedEngine), "test model instantiation " + str( - yaml_list - ) - else: - assert isinstance(model, PipelineEngine), "test model instantiation " + str( - yaml_list - ) - if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0: - clear_test_dirs() + t1 = test_instantiate_optimizers_class() + t1.run_test_model_instantiation(param_dict) + + +class test_instantiate_optimizers_class(DistributedTest): + world_size = 2 + + def run_test_model_instantiation(yaml_list=None, param_dict=None): + from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine + + model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict) + if args_loaded.pipe_parallel_size < 2: + assert isinstance( + model, DeepSpeedEngine + ), "test model instantiation " + str(yaml_list) + else: + assert isinstance(model, PipelineEngine), "test model instantiation " + str( + yaml_list + ) + if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0: + clear_test_dirs() diff --git a/tests/model/test_model_train.py b/tests/model/test_model_train.py index 0138a7f79..31798f342 100644 --- a/tests/model/test_model_train.py +++ b/tests/model/test_model_train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -38,7 +38,6 @@ "bigbird", "bslongformer", "gmlp", - "amlp", "flash", ], "hidden_dropout": [0, 0.1], @@ -50,7 +49,10 @@ keys_to_test = PARAMS_TO_TEST.keys() - +# TODO: fix model training tests +@pytest.mark.skip( + reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue." +) @pytest.mark.parametrize( "key, value", [(key, value) for key in keys_to_test for value in PARAMS_TO_TEST[key]], diff --git a/tests/neox_args/test_neoxargs_commandline.py b/tests/neox_args/test_neoxargs_commandline.py index cf39f75d0..0d3c7e5fb 100644 --- a/tests/neox_args/test_neoxargs_commandline.py +++ b/tests/neox_args/test_neoxargs_commandline.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/neox_args/test_neoxargs_implementation.py b/tests/neox_args/test_neoxargs_implementation.py index c77fa178f..176887c9c 100644 --- a/tests/neox_args/test_neoxargs_implementation.py +++ b/tests/neox_args/test_neoxargs_implementation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/neox_args/test_neoxargs_load.py b/tests/neox_args/test_neoxargs_load.py index 5f94f4fd6..f5d278112 100644 --- a/tests/neox_args/test_neoxargs_load.py +++ b/tests/neox_args/test_neoxargs_load.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/neox_args/test_neoxargs_usage.py b/tests/neox_args/test_neoxargs_usage.py index 40532f23e..176151c2a 100644 --- a/tests/neox_args/test_neoxargs_usage.py +++ b/tests/neox_args/test_neoxargs_usage.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/pytest.ini b/tests/pytest.ini index c2bb21d77..6fd100ea2 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tests/test_configs/test_train_base.yml b/tests/test_configs/test_train_base.yml index 28e41adbf..bb66a5b97 100644 --- a/tests/test_configs/test_train_base.yml +++ b/tests/test_configs/test_train_base.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -33,6 +33,7 @@ "scaled_upper_triang_masked_softmax_fusion": false, "bias_gelu_fusion": false, "rope_fusion": false, + "layernorm_fusion": false, # optimizer settings "optimizer": { diff --git a/tests/unit/test_arguments.py b/tests/unit/test_arguments.py index 501748366..b52a3b065 100644 --- a/tests/unit/test_arguments.py +++ b/tests/unit/test_arguments.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # limitations under the License. from megatron.neox_arguments import NeoXArgs -from tests.common import BASE_CONFIG, distributed_test +from tests.common import BASE_CONFIG, DistributedTest def test_main_constructor(): @@ -24,18 +24,26 @@ def test_main_constructor(): neox_args.configure_distributed_args() -def test_constructor_from_ymls(): - @distributed_test(world_size=[1, 2]) - def _test_constructor_from_ymls(): +class test_constructor_from_ymls_class(DistributedTest): + world_size = 2 + + def test(self): neox_args = NeoXArgs.from_ymls(["tests/config/test_setup.yml"]) neox_args.configure_distributed_args() - _test_constructor_from_ymls() +def test_constructor_from_ymls(): + t1 = test_constructor_from_ymls_class() + t1.test() -def test_constructor_from_dict(): - @distributed_test(world_size=[1, 2]) - def _test_constructor_from_dict(): + +class test_constructor_from_dict_class(DistributedTest): + world_size = 2 + + def test(self): neox_args = NeoXArgs.from_dict(BASE_CONFIG) - _test_constructor_from_dict() + +def test_constructor_from_dict(): + t1 = test_constructor_from_dict_class() + t1.test() diff --git a/tests/unit/test_format_conversion_scripts.py b/tests/unit/test_format_conversion_scripts.py index 9c78c962d..e0801434c 100644 --- a/tests/unit/test_format_conversion_scripts.py +++ b/tests/unit/test_format_conversion_scripts.py @@ -1,12 +1,9 @@ import pytest -from tools.ckpts import convert_sequential_to_hf +from tools.ckpts import convert_neox_to_hf from tests.common import simulate_deepy_env, save_random_model from megatron.neox_arguments.neox_args import NeoXArgsTokenizer -# Test is failing; possibly we resolve by using the word embedding weights in the 'layer_00-model_00-model_states.pt' file? -@pytest.mark.xfail( - reason="Failing to find 'word_embeddings.weight' in state_dict['module'] from the 'mp_rank_00_model_states.pt' file" -) + def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): # Generate random GPT-NEOX model, check we can convert to hf format model_dir = str(tmpdir) @@ -24,6 +21,4 @@ def test_gpt_neox_to_huggingface(monkeypatch, tmpdir, tmp_path): model_dir, ] overwrite_values = {"tokenizer_type": NeoXArgsTokenizer.tokenizer_type} - convert_sequential_to_hf.main( - input_args=script_args, overwrite_values=overwrite_values - ) + convert_neox_to_hf.main(input_args=script_args, overwrite_values=overwrite_values) diff --git a/tests/unit/test_launcher_scripts.py b/tests/unit/test_launcher_scripts.py index 626788b59..bdc38f111 100644 --- a/tests/unit/test_launcher_scripts.py +++ b/tests/unit/test_launcher_scripts.py @@ -1,12 +1,13 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
import pytest -from tools.datasets import preprocess_data -import train + +import eval import generate -import evaluate +import train from megatron.neox_arguments import NeoXArgs -from tests.common import simulate_deepy_env, save_random_model +from tests.common import save_random_model, simulate_deepy_env +from tools.datasets import preprocess_data @pytest.fixture( @@ -55,6 +56,9 @@ def test_preprocess_data(tokenizer_type): preprocess_data.main(input_args) +@pytest.mark.skip( + reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue." +) def test_generate(monkeypatch, tmpdir, tmp_path, sample_input_file): model_dir = str(tmpdir) sample_output_file = str(tmp_path) + ".txt" @@ -71,6 +75,9 @@ def test_generate(monkeypatch, tmpdir, tmp_path, sample_input_file): generate.main(input_args=deepspeed_main_args, overwrite_values=generate_args) +@pytest.mark.skip( + reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue." +) def test_evaluate(monkeypatch, tmpdir, tmp_path): model_dir = str(tmpdir) sample_output_file = str(tmp_path) @@ -84,9 +91,12 @@ def test_evaluate(monkeypatch, tmpdir, tmp_path): "eval_tasks": ["lambada"], # ["lambada", "hellaswag", "piqa", "sciq"], "eval_results_prefix": sample_output_file, } - evaluate.main(input_args=deepspeed_main_args, overwrite_values=evaluate_args) + eval.main(input_args=deepspeed_main_args, overwrite_values=evaluate_args) +@pytest.mark.skip( + reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue." +) def test_finetuning(monkeypatch, tmpdir, tmp_path): # Save random model, load random model, keep training # TODO: add mocking to check that we're not ignoring the previously loaded model @@ -101,6 +111,9 @@ def test_finetuning(monkeypatch, tmpdir, tmp_path): train.main(input_args=deepspeed_main_args, overwrite_values=finetune_args) +@pytest.mark.skip( + reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue." +) def test_train_launcher(monkeypatch): input_args = ["train.py", "tests/config/test_setup.yml"] deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args) diff --git a/tools/bash/sync.sh b/tools/bash/sync.sh index d3788cce4..fd9377dbb 100755 --- a/tools/bash/sync.sh +++ b/tools/bash/sync.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/bash/sync_cmd.sh b/tools/bash/sync_cmd.sh index 6525247fb..2d8a617ca 100644 --- a/tools/bash/sync_cmd.sh +++ b/tools/bash/sync_cmd.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/bash/syncdir.sh b/tools/bash/syncdir.sh index 210d142ae..229c9af9a 100755 --- a/tools/bash/syncdir.sh +++ b/tools/bash/syncdir.sh @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
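The launcher-script tests above all follow one pattern: fake the deepy environment, then call the script's `main()` with `overwrite_values`. A condensed sketch of that pattern (the `overwrite_values` contents here are illustrative, not the exact keys used by the skipped tests):

```python
import train
from tests.common import simulate_deepy_env

def smoke_test_train_launcher(monkeypatch):
    # Build argv/env the way deepy.py would, then drive the entry point directly.
    input_args = ["train.py", "tests/config/test_setup.yml"]
    deepspeed_main_args = simulate_deepy_env(monkeypatch, input_args)
    train.main(input_args=deepspeed_main_args, overwrite_values={"train_iters": 2})
```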
diff --git a/tools/ckpts/convert_module_to_hf.py b/tools/ckpts/convert_module_to_hf.py deleted file mode 100644 index c776a2817..000000000 --- a/tools/ckpts/convert_module_to_hf.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright (c) 2023, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -import yaml -import argparse -from tqdm import tqdm -from typing import List - -import torch -from transformers import GPTNeoXConfig, GPTNeoXForCausalLM - - -sys.path.append( - os.path.abspath( - os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) - ) -) -from megatron.tokenizer import build_tokenizer - - -""" -A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. - -Note that this script does not support all NeoX features. -Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. - -(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture. -""" - - -def load_partitions( - input_checkpoint_path, mp_partitions, layer_idx -) -> List[torch.Tensor]: - """Returns a list containing all weights in a given layer from a model (across MP partitions)""" - - loaded_tp_ranks = [ - torch.load( - os.path.join( - input_checkpoint_path, - f"layer_{layer_idx:02}-model_{i:02}-model_states.pt", - ), - map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), - ) - for i in range(mp_partitions) - ] - - return loaded_tp_ranks - - -def get_key(loaded_config, key, default=None): - """ - Search for a given key in a NeoX yaml. normalizes underscores -> hyphens - """ - key = key.replace("_", "-") - try: - return loaded_config[key] - except KeyError: - key = key.replace("-", "_") - try: - return loaded_config[key] - except KeyError: - return default - - -def create_config(neox_config): - """take in a loaded yaml from NeoX and assign relevant values to HF config. - Returns: GPTNeoXConfig() object - """ - - class TokenizerArgs: - # kinda hacky. - # this is to get something with the same interface as is used in build_tokenizer() - # without diving into loading a neox_args object or using argparse etc. - def __init__(self, neox_config): - self.make_vocab_size_divisible_by = get_key( - neox_config, "make-vocab-size-divisible-by", default=128 - ) - self.model_parallel_size = get_key(neox_config, "model-parallel-size") - self.vocab_file = get_key(neox_config, "vocab-file") - self.merge_file = get_key(neox_config, "merge-file") - self.tokenizer_type = get_key(neox_config, "tokenizer-type") - - self.rank = 0 - - args = TokenizerArgs(neox_config) - tokenizer = build_tokenizer(args) - try: # GPT2TokenizerFast raises NotImplementedError - pad_token = tokenizer.pad - except: - pad_token = ( - 1 # pad defaulting to 1. 
follows convention from GPT-NeoX-20b tokenizer - ) - - # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default - use_tied_lns = get_key(neox_config, "gpt-j-tied", False) - - if use_tied_lns: - raise NotImplementedError( - """ERROR: Huggingface Transformers does not yet support a single shared layernorm - per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. - See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" - ) - - # set all config values. - hf_config = GPTNeoXConfig( - vocab_size=args.padded_vocab_size, - hidden_size=get_key(neox_config, "hidden-size"), - num_hidden_layers=get_key(neox_config, "num-layers"), - num_attention_heads=get_key(neox_config, "num-attention-heads"), - intermediate_size=(get_key(neox_config, "hidden-size") * 4), - hidden_act=get_key(neox_config, "activation", default="gelu"), - rotary_pct=get_key(neox_config, "rotary-pct", default=1.0), - rotary_emb_base=get_key(neox_config, "rotary-emb-base", default=10000), - max_position_embeddings=get_key(neox_config, "max-position-embeddings"), - initializer_range=get_key(neox_config, "init-method-std", 0.02), - layer_norm_eps=get_key(neox_config, "layernorm-epsilon", 1e-5), - use_cache=True, - bos_token_id=tokenizer.eod, - eos_token_id=tokenizer.eod, - tie_word_embeddings=(not get_key(neox_config, "no-weight-tying", False)), - use_parallel_residual=get_key(neox_config, "gpt-j-residual", False), - ) - return hf_config - - -def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): - """convert a NeoX checkpoint to a HF model format. - should perform model-parallel merging correctly - but only supports features allowed by HF GPT-NeoX implementation (e.g. rotary embeddings) - """ - - hf_config = GPTNeoXConfig() - - hf_config = create_config(loaded_config) - - hf_model = GPTNeoXForCausalLM(hf_config) - - # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights - fp16 = get_key(loaded_config, "fp16") - if fp16: - try: - # this conditional is quite messy because there were a number of ways to specify bf16 or fp16 training - # in DeeperSpeed v1.0 . - if (fp16.get("fp16", None) or fp16["enabled"]) and not ( - fp16.get("type", None) == "bfloat16" - ): - hf_model.half() - print("Saving weights in fp16 precision...") - elif fp16.get("type", None) == "bfloat16": - hf_model.to(dtype=torch.bfloat16) - print("Saving weights in bf16 precision...") - except: - print( - "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." 
- ) - - mp_partitions = get_key(loaded_config, "model-parallel-size") - - ### Embedding layer ### - loaded_tp_ranks = load_partitions(input_checkpoint_path, mp_partitions, 0) - hf_model.gpt_neox.embed_in.load_state_dict( - { - "weight": torch.cat( - [t["word_embeddings.weight"] for t in loaded_tp_ranks], dim=0 - ) - } - ) - - assert ( - hf_config.vocab_size == hf_model.gpt_neox.embed_in.weight.shape[0] - ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {hf_model.gpt_neox.embed_in.shape[0]}" - ### End Embedding Layer ### - - for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): - - # get layer from hf model - hf_layer = hf_model.gpt_neox.layers[layer_i] - - # + 2 bc of embed layer and a dummy _pre_transformer_block - loaded_tp_ranks = load_partitions( - input_checkpoint_path, mp_partitions, layer_i + 2 - ) - - state_dict = {} - for key in [ - "attention.dense.weight", - "mlp.dense_4h_to_h.weight", - ]: - state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=1) - - # average layernorm stats over mp ranks - for key in [ - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - ]: - state_dict[key] = (sum([t[key] for t in loaded_tp_ranks])) / len( - loaded_tp_ranks - ) - - # LinearWithTPMerge - for key in [ - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight", - "attention.query_key_value.bias", - ]: - state_dict[key] = torch.cat([t[key] for t in loaded_tp_ranks], dim=0) - - # LinearWithTPSplitBias - for key in [ - "mlp.dense_4h_to_h.bias", - "attention.dense.bias", - ]: - state_dict[key] = sum([t[key] for t in loaded_tp_ranks]) / len( - loaded_tp_ranks - ) - - # Just take one - state_dict["attention.rotary_emb.inv_freq"] = loaded_tp_ranks[0][ - "attention.rotary_emb.inv_freq" - ] - if "attention.bias" in hf_layer.state_dict(): - state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - if "attention.masked_bias" in hf_layer.state_dict(): - state_dict["attention.masked_bias"] = hf_layer.state_dict()[ - "attention.masked_bias" - ] - - # load state_dict into layer - hf_layer.load_state_dict(state_dict) - - # Load final layer norm - loaded_tp_ranks = load_partitions( - input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 3 - ) - - hf_model.gpt_neox.final_layer_norm.load_state_dict( - { - "weight": (sum([t["norm.weight"] for t in loaded_tp_ranks])) - / len(loaded_tp_ranks), - "bias": (sum([t["norm.bias"] for t in loaded_tp_ranks])) - / len(loaded_tp_ranks), - } - ) - del loaded_tp_ranks - - # Load output embedding - loaded_tp_ranks = load_partitions( - input_checkpoint_path, mp_partitions, get_key(loaded_config, "num-layers") + 4 - ) - - hf_model.embed_out.load_state_dict( - { - "weight": torch.cat( - [t["final_linear.weight"] for t in loaded_tp_ranks], dim=0 - ), - } - ) - - del loaded_tp_ranks - - return hf_model - - -if __name__ == "__main__": - - # before running script: - # `pip install --upgrade transformers` - # `huggingface-cli login` - # - from huggingface_hub import create_repo, HfApi - - parser = argparse.ArgumentParser( - description="Merge MP partitions and convert to HF Model." - ) - parser.add_argument( - "--input_dir", - type=str, - help="Path to NeoX checkpoint, e.g. 
/path/to/model/global_step143000", - ) - parser.add_argument( - "--config_file", - type=str, - help="Path to config file for the input NeoX checkpoint.", - ) - parser.add_argument( - "--output_dir", - type=str, - help="Output dir, where to save the HF Model, tokenizer, and configs", - ) - parser.add_argument( - "--upload", - action="store_true", - help="Set to true in order to upload to the HF Hub directly.", - ) - args = parser.parse_args() - - with open(args.config_file) as f: - loaded_config = yaml.full_load(f) - - hf_model = convert(args.input_dir, loaded_config, args.output_dir) - - hf_model.save_pretrained(args.output_dir) - - # save tokenizer to directory as well, for easy loading of model as a HF model - tokenizer_type = get_key(loaded_config, "tokenizer-type") - - if tokenizer_type == "HFTokenizer": - print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") - from transformers import PreTrainedTokenizerFast - - tokenizer = PreTrainedTokenizerFast( - tokenizer_file=get_key(loaded_config, "vocab-file") - ) - print("loaded tokenizer: ", tokenizer) - tokenizer.save_pretrained(args.output_dir) - print("tokenizer saved!") - - if args.upload: - repo_name = input("Provide a repository name for the HF Hub: ") - create_repo(repo_name, repo_type="model", private=False, use_auth_token=True) - - api = HfApi() - api.upload_folder( - folder_path=args.output_dir, - repo_id=repo_name, - repo_type="model", - ) diff --git a/tools/ckpts/convert_neox_to_hf.py b/tools/ckpts/convert_neox_to_hf.py new file mode 100644 index 000000000..6ad67f208 --- /dev/null +++ b/tools/ckpts/convert_neox_to_hf.py @@ -0,0 +1,728 @@ +# Copyright (c) 2023, EleutherAI +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import yaml +import argparse +from tqdm import tqdm + +import torch +from transformers import ( + MistralConfig, + LlamaConfig, + GPTNeoXConfig, + AutoModelForCausalLM, + AutoConfig, +) + +from typing import List, Literal + +sys.path.append( + os.path.abspath( + os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) + ) +) +from megatron.tokenizer import build_tokenizer + + +""" +A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. + +Note that this script does not support all NeoX features. +Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. + +(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture). +""" + + +# Model definitions: a list of keys, and where they fall in terms of handling them in the presence of TP. 
+# in format : {model arch: {param type: {param in neox: param in HF}}} + +MODEL_KEYS = { + "neox": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.dense_h_to_4h.weight": "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias": "mlp.dense_h_to_4h.bias", + "attention.query_key_value.weight": "attention.query_key_value.weight", + "attention.query_key_value.bias": "attention.query_key_value.bias", # TODO: handle GQA separately? + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "attention.dense.weight", + "mlp.dense_4h_to_h.weight": "mlp.dense_4h_to_h.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": { + "mlp.dense_4h_to_h.bias": "mlp.dense_4h_to_h.bias", + "attention.dense.bias": "attention.dense.bias", + }, + "NORM_KEYS": { + "input_layernorm.weight": "input_layernorm.weight", + "input_layernorm.bias": "input_layernorm.bias", + "post_attention_layernorm.weight": "post_attention_layernorm.weight", + "post_attention_layernorm.bias": "post_attention_layernorm.bias", + }, + "FINAL_NORM_KEYS": { + "norm.weight": "weight", + "norm.bias": "bias", + }, + }, + "llama": { + "COLUMN_PARALLEL_LINEAR_KEYS": { + "mlp.w1.weight": "mlp.gate_proj.weight", + "mlp.w3.weight": "mlp.up_proj.weight", + }, + "ROW_PARALLEL_LINEAR_KEYS": { + "attention.dense.weight": "self_attn.o_proj.weight", + "mlp.w2.weight": "mlp.down_proj.weight", + }, + "ROW_PARALLEL_BIAS_KEYS": {}, # No biases in RowParallelLinear layers + "NORM_KEYS": { + "input_layernorm.scale": "input_layernorm.weight", + "post_attention_layernorm.scale": "post_attention_layernorm.weight", + }, + "FINAL_NORM_KEYS": { + "norm.scale": "weight", + }, + "GQA_QKV_KEYS": { # because Llama can have Grouped Query Attention and has separate Q, K, and V linear proj params, handle them separately. + "attention.query_key_value.weight": [ + "self_attn.q_proj.weight", + "self_attn.k_proj.weight", + "self_attn.v_proj.weight", + ], + }, + }, +} + +MODEL_KEYS["mistral"] = MODEL_KEYS["llama"] + + +def load_partitions( + input_checkpoint_path: str, mp_partitions: int, layer_idx: int, sequential: bool +) -> List[torch.Tensor]: + """Returns a list containing all states from a model (across MP partitions)""" + + if sequential: + filename_format = f"mp_rank_{{i:02}}_model_states.pt" + else: + filename_format = f"layer_{layer_idx:02}-model_{{i:02}}-model_states.pt" + + loaded_tp_ranks = [ + torch.load( + os.path.join( + input_checkpoint_path, + filename_format.format(i=i), + ), + map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), + ) + for i in range(mp_partitions) + ] + + return loaded_tp_ranks + + +def get_state( + state_dicts: List[torch.Tensor], key: str, layer_idx: int, sequential: bool +) -> torch.Tensor: + """Helper that returns a list containing a given weight's state from each MP partition, for a given layer in the model.""" + + if sequential: + # use the correct key into the sequential dict for given weight/provided key + key = f"sequential.{layer_idx}.{key}" + + return [state_dict["module"][key] for state_dict in state_dicts] + else: + # For the PipelineModule case, we don't need any key / module prefix. just grab this weight value. + # layer_idx is also ignored because we've loaded only this layer's weights, ahead of time. + key = key + + return [state_dict[key] for state_dict in state_dicts] + + +def get_key(loaded_config, key, default=None): + """ + Search for a given key in a NeoX yaml. 
normalizes underscores -> hyphens + """ + key = key.replace("_", "-") + try: + return loaded_config[key] + except KeyError: + key = key.replace("-", "_") + try: + return loaded_config[key] + except KeyError: + return default + + +def create_config(neox_config, architecture="neox"): + """take in a loaded yaml from NeoX and assign relevant values to HF config. + Returns: GPTNeoXConfig() object + """ + + def gated_size(hidden_dim): + # takes in a hidden dim and calculates intermediate dim of a LLaMAParallelMLP. + # (only used if intermediate_size not specified in config) + # hidden-size * 8 / 3 , rounded up to nearest multiple of 256 + ff_dim = int(2 * hidden_dim * 4 / 3) + ff_dim = 256 * ((ff_dim + 256 - 1) // 256) + return ff_dim + + class TokenizerArgs: + # kinda hacky. + # this is to get something with the same interface as is used in build_tokenizer() + # without diving into loading a neox_args object or using argparse etc. + def __init__(self, neox_config): + self.make_vocab_size_divisible_by = get_key( + neox_config, "make-vocab-size-divisible-by", default=128 + ) + self.model_parallel_size = get_key(neox_config, "model-parallel-size") + self.vocab_file = get_key(neox_config, "vocab-file") + self.merge_file = get_key(neox_config, "merge-file") + self.tokenizer_type = get_key(neox_config, "tokenizer-type") + + self.rank = 0 + + args = TokenizerArgs(neox_config) + tokenizer = build_tokenizer(args) + try: # GPT2TokenizerFast raises NotImplementedError + pad_token = tokenizer.pad + except: + pad_token = ( + 1 # pad defaulting to 1. follows convention from GPT-NeoX-20b tokenizer + ) + + # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default + use_tied_lns = get_key(neox_config, "gpt-j-tied", False) + + if use_tied_lns: + raise NotImplementedError( + """ERROR: Huggingface Transformers does not yet support a single shared layernorm + per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. + See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" + ) + + # set all config values. + + # shared config parameters. 
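A quick worked check of the `gated_size()` helper defined above, restated as a standalone sketch; the hidden size of 4096 is an assumed example, chosen because the rounding-to-256 step reproduces the familiar 11008 intermediate size listed for the 7B model elsewhere in this change.

```python
def gated_size(hidden_dim: int) -> int:
    # hidden_dim * 8 / 3, rounded up to the nearest multiple of 256
    ff_dim = int(2 * hidden_dim * 4 / 3)
    return 256 * ((ff_dim + 256 - 1) // 256)

# assumed example: a 4096 hidden size yields the 7B intermediate size of 11008
assert gated_size(4096) == 11008
```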
+ args = { + "vocab_size": args.padded_vocab_size, + "hidden_size": get_key(neox_config, "hidden-size"), + "num_hidden_layers": get_key(neox_config, "num-layers"), + "num_attention_heads": get_key(neox_config, "num-attention-heads"), + "max_position_embeddings": get_key(neox_config, "max-position-embeddings"), + "initializer_range": get_key(neox_config, "init-method-std", 0.02), + "tie_word_embeddings": (not get_key(neox_config, "no-weight-tying", False)), + "use_cache": True, + } + if architecture == "mistral" or architecture == "llama": + args.update( + { + "intermediate_size": get_key( + neox_config, + "intermediate-size", + gated_size(get_key(neox_config, "hidden-size")), + ), + "num_key_value_heads": get_key( + neox_config, + "num-kv-heads", + get_key(neox_config, "num-attention-heads"), + ), + "hidden_act": get_key(neox_config, "activation", default="silu"), + "rms_norm_eps": get_key(neox_config, "rms-norm-epsilon", 1.0e-6), + "bos_token_id": tokenizer.eod, + "eos_token_id": tokenizer.eod, + "rope_theta": get_key(neox_config, "rotary-emb-base", 10000.0), + } + ) + + if architecture == "mistral": + # mistral-specific options + args.update( + { + "sliding_window": get_key( + neox_config, "sliding-window-width", 4096 + ), + } + ) + hf_config = MistralConfig(**args) + elif architecture == "llama": + # llama-specific options + args.update( + { + # NeoX library defaults to using bias in attention + "attention_bias": get_key( + neox_config, "use_bias_in_attn_linear", True + ), + } + ) + hf_config = LlamaConfig(**args) + else: + # GPT-NeoX HF model class-specific options + args.update( + { + "rotary_pct": get_key(neox_config, "rotary-pct", default=1.0), + "rotary_emb_base": get_key( + neox_config, "rotary-emb-base", default=1000.0 + ), + "use_parallel_residual": get_key(neox_config, "gpt-j-residual", False), + "layer_norm_eps": get_key(neox_config, "layernorm-epsilon", 1e-5), + } + ) + hf_config = GPTNeoXConfig(**args) + + return hf_config + + +def reshard_and_split_qkv( + param_mapping: dict, # a dictionary mapping the QKV weight keys in GPT-NeoX -> a list of keys representing the Q, K, and V weight keys the HF model will use + hf_config: AutoConfig, # a HF model config for the model + loaded_tp_ranks: List[torch.Tensor], + layer_idx: int, + sequential: bool, +): + """ + A helper function which performs reshaping and sharding to make the QKV projection from NeoX compatible with HF Llama models, + even when grouped-query attention is required. + """ + for key, hf_keys in param_mapping.items(): + assert ( + isinstance(hf_keys, list) and len(hf_keys) == 3 + ), "Must map QKV to precisely 3 resulting weight matrices." + + for key, hf_keys in param_mapping.items(): + # we first merge the QKV proj. across TP ranks + sharded_qkv = torch.stack( + get_state(loaded_tp_ranks, key, layer_idx, sequential), dim=0 + ) + # should now have shape [TP_SIZE, (hidden_size + 2 * kv_hidden_size) / TP_SIZE, hidden_size]. 
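To make the fused QKV shape above concrete, here is a small arithmetic sketch of the per-head widths involved in the split that follows. The dimensions (hidden size 8192, 64 query heads, 8 KV heads) are hypothetical grouped-query values and do not come from any config in this change.

```python
# hypothetical GQA dimensions, for illustration only
hidden_size, num_attention_heads, num_key_value_heads = 8192, 64, 8

dims_per_head = hidden_size // num_attention_heads                          # 128
q_width = dims_per_head                                                     # 128
kv_width = int(
    (num_key_value_heads / num_attention_heads) * hidden_size // num_attention_heads
)                                                                           # 16

# each fused QKV row block is q_width + 2 * kv_width wide, matching the
# dims_per_head * (1 + 2 * kv-to-q head ratio) expression used in the view above
assert q_width + 2 * kv_width == int(
    dims_per_head * (1 + 2 * num_key_value_heads / num_attention_heads)
)
```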
+ + sharded_qkv = sharded_qkv.view( + len(loaded_tp_ranks), + hf_config.num_attention_heads // len(loaded_tp_ranks), + int( + hf_config.hidden_size + // hf_config.num_attention_heads + * ( + 1 + + 2 * hf_config.num_key_value_heads / hf_config.num_attention_heads + ) + ), + hf_config.hidden_size, + ) # is meant to convert to shape [TP_SIZE, NUM_QUERY_HEADS_PER_SHARD, dims_per_head * (1 + 2 * kv-to-q head ratio), hidden_size] + + q, k, v = torch.split( + sharded_qkv, + [ + hf_config.hidden_size // hf_config.num_attention_heads, + int( + (hf_config.num_key_value_heads / hf_config.num_attention_heads) + * hf_config.hidden_size + // hf_config.num_attention_heads + ), + int( + (hf_config.num_key_value_heads / hf_config.num_attention_heads) + * hf_config.hidden_size + // hf_config.num_attention_heads + ), + ], + dim=2, + ) + # splits along the (dims_per_head * (1 + 2 * kv-to-q head ratio)_ dim to get 3 tensors: + # 1 x [TP_SIZE, NUM_Q_HEADS_PER_SHARD, dims_per_head, hidden_size] and 2 x [TP_SIZE, NUM_Q_HEADS_PER_SHARD, (dims_per_head / kv-to-q head ratio), hidden_size] + # these are the Q, and K, V tensors respectively. + + # we have to do additional reshape for each individual tensor now, + # into the expected square (or smaller than square, for K/V tensors) shape + q, k, v = q.squeeze(dim=2), k.squeeze(dim=2), v.squeeze(dim=2) + q = q.view( + hf_config.num_attention_heads, + hf_config.hidden_size // hf_config.num_attention_heads, + hf_config.hidden_size, + ).reshape(hf_config.hidden_size, hf_config.hidden_size) + k = k.reshape( + hf_config.num_key_value_heads, + hf_config.hidden_size // hf_config.num_attention_heads, + hf_config.hidden_size, + ).reshape( + hf_config.hidden_size + // hf_config.num_attention_heads + * hf_config.num_key_value_heads, + hf_config.hidden_size, + ) + v = v.reshape( + hf_config.num_key_value_heads, + hf_config.hidden_size // hf_config.num_attention_heads, + hf_config.hidden_size, + ).reshape( + hf_config.hidden_size + // hf_config.num_attention_heads + * hf_config.num_key_value_heads, + hf_config.hidden_size, + ) + + # return these + state_dict = {} + for hf_key, proj in zip(hf_keys, [q, k, v]): + state_dict[hf_key] = proj.clone() + return state_dict + + +def convert( + input_checkpoint_path, + loaded_config, + output_checkpoint_path, + sequential: bool = True, + precision: Literal["auto", "fp16", "bf16", "fp32"] = "auto", + architecture: Literal["neox", "llama", "mistral"] = "neox", +): + """convert a NeoX checkpoint to a HF model format. + should perform model-parallel merging correctly + but only supports features allowed by HF GPT-NeoX implementation (e.g. 
rotary embeddings) + """ + + ARCH = MODEL_KEYS[architecture] + + hf_config = create_config(loaded_config, architecture=architecture) + + hf_model = AutoModelForCausalLM.from_config(hf_config) + + if architecture == "neox": + hf_transformer = hf_model.gpt_neox + else: + hf_transformer = hf_model.model + + if precision == "auto": + print("Auto-detecting precision to save model into...") + # save model in FP16 if Deepspeed fp16 was used in config, else 32 bit + fp16 = get_key(loaded_config, "fp16") + + if fp16: + try: + # current behavior is to pass "fp16": {"enabled": true}, when using upstream Deepspeed + if fp16["enabled"]: + hf_model.half() + print("Saving weights in fp16 precision...") + except: + try: + # attempt to access bf16 dict in yaml file, if fp16 not enabled + bf16 = get_key(loaded_config, "bf16") + if bf16: + hf_model.to(dtype=torch.bfloat16) + print("Saving weights in bf16 precision...") + except: + hf_model.to(dtype=torch.float) + print( + "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." + ) + else: + name_to_dtype = { + "bf16": torch.bfloat16, + "fp16": torch.float16, + "fp32": torch.float, + } + print(f"Saving model into specified {precision} precision...") + hf_model.to(dtype=name_to_dtype[precision]) + + mp_partitions = get_key(loaded_config, "model-parallel-size") + + # Sequential saves all model states from an MP rank in one file. + # so we only load the MP ranks only once and index into them with get_state(). + # for the pipeline-parallel case (pipeline-parallel-size >= 1), + # we must load the correct layer's states at each step. + # (this does mean that less memory is required for PP conversion.) + loaded_tp_ranks = load_partitions( + input_checkpoint_path, mp_partitions, layer_idx=0, sequential=sequential + ) + + ### Embedding layer ### + # Embedding is layer idx 0 + if architecture == "neox": + embed_in = hf_transformer.embed_in + else: + embed_in = hf_transformer.embed_tokens + embed_in.load_state_dict( # TODO: embed_in is not always model's name for embedding + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "word_embeddings.weight", + layer_idx=0, + sequential=sequential, + ), + dim=0, + ) + } + ) + assert ( + hf_config.vocab_size == embed_in.weight.shape[0] + ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {embed_in.shape[0]}" + ### End Embedding Layer ### + + for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): + + # get layer from hf model + hf_layer = hf_transformer.layers[layer_i] # TODO: model module names + + if not sequential: + # in the non-sequential case, must load from each layer individually. 
+ # use layer index + 2 bc of embed layer and a dummy _pre_transformer_block, which are "layers 0 and 1" + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + layer_idx=layer_i + 2, + sequential=sequential, + ) + + # + 2 bc of embed layer and a dummy _pre_transformer_block + state_dict = {} + for key, hf_key in ARCH["ROW_PARALLEL_LINEAR_KEYS"].items(): + state_dict[hf_key] = torch.cat( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ), + dim=1, + ) + + # average layernorm stats over mp ranks + for key, hf_key in ARCH["NORM_KEYS"].items(): + state_dict[hf_key] = sum( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ) + ) / len(loaded_tp_ranks) + + # LinearWithTPMerge + for key, hf_key in ARCH["COLUMN_PARALLEL_LINEAR_KEYS"].items(): + state_dict[hf_key] = torch.cat( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ), + dim=0, + ) + + # LinearWithTPSplitBias + for key, hf_key in ARCH["ROW_PARALLEL_BIAS_KEYS"].items(): + state_dict[hf_key] = sum( + get_state( + loaded_tp_ranks, key, layer_idx=layer_i + 2, sequential=sequential + ) + ) + + # Just take one + if "attention.rotary_emb.inv_freq" in hf_layer.state_dict(): + state_dict["attention.rotary_emb.inv_freq"] = get_state( + loaded_tp_ranks, + "attention.rotary_emb.inv_freq", + layer_idx=layer_i + 2, + sequential=sequential, + )[0] + + if "attention.bias" in hf_layer.state_dict(): + state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] + if "attention.masked_bias" in hf_layer.state_dict(): + state_dict["attention.masked_bias"] = hf_layer.state_dict()[ + "attention.masked_bias" + ] + + # some architectures, like Mistral and Llama, have the following which must be handled specially: + # - Q, K, V projections are performed separately, so we must split apart GPT-NeoX library's single QKV proj + # - Support for Grouped-Query Attention, meaning the Q and the K, V projections may not be the same size + if "GQA_QKV_KEYS" in ARCH: + state_dict.update( + reshard_and_split_qkv( + param_mapping=ARCH["GQA_QKV_KEYS"], + hf_config=hf_config, + loaded_tp_ranks=loaded_tp_ranks, + layer_idx=layer_i + 2, + sequential=sequential, + ) + ) + # load state_dict into layer + hf_layer.load_state_dict(state_dict) + + if not sequential: + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 3, + sequential=sequential, + ) + # Load final layer norm + if architecture == "neox": + lm_head = hf_model.embed_out + else: + lm_head = hf_model.lm_head + norm_state_dict = {} + for key, hf_key in ARCH["FINAL_NORM_KEYS"].items(): + norm_state_dict[hf_key] = sum( + get_state( + loaded_tp_ranks, + key, + layer_idx=get_key(loaded_config, "num-layers") + 3, + sequential=sequential, + ) + ) / len(loaded_tp_ranks) + + if architecture == "neox": + final_layer_norm = hf_transformer.final_layer_norm + else: + final_layer_norm = hf_transformer.norm + + final_layer_norm.load_state_dict(norm_state_dict) + + # Load output embedding + if not sequential: + loaded_tp_ranks = load_partitions( + input_checkpoint_path, + mp_partitions, + get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ) + # output embedding / LM head + if architecture == "neox": # name of lm head / final linear proj varies + lm_head = hf_model.embed_out + else: + lm_head = hf_model.lm_head + lm_head.load_state_dict( + { + "weight": torch.cat( + get_state( + loaded_tp_ranks, + "final_linear.weight", + 
layer_idx=get_key(loaded_config, "num-layers") + 4, + sequential=sequential, + ), + dim=0, + ), + } + ) + + del loaded_tp_ranks + + return hf_model + + +def main(input_args=None, overwrite_values=None): + from huggingface_hub import create_repo, HfApi + + parser = argparse.ArgumentParser( + description="Merge MP partitions and convert to HF Model." + ) + parser.add_argument( + "--input_dir", + type=str, + help="Path to NeoX checkpoint, e.g. /path/to/model/global_step143000", + ) + parser.add_argument( + "--config_file", + type=str, + help="Path to config file for the input NeoX checkpoint.", + ) + parser.add_argument( + "--output_dir", + type=str, + help="Output dir, where to save the HF Model, tokenizer, and configs", + ) + parser.add_argument( + "--precision", + type=str, + default="auto", + help="What precision to save the model into. Defaults to auto, which auto-detects which 16-bit dtype to save into, or falls back to fp32.", + ) + parser.add_argument( + "--no_save_tokenizer", + action="store_true", + help="Whether to skip saving the tokenizer alongside a model.", + ) + parser.add_argument( + "--architecture", + type=str, + default="neox", + help="What HF model class type to export into.", + ) + args = parser.parse_args(input_args) + + # validate arguments + assert args.precision in [ + "auto", + "fp16", + "bf16", + "fp32", + ], f"expected --precision to be one of 'auto', 'fp16', 'bf16', 'fp32' but got '{args.precision}' !" + assert args.architecture in [ + "neox", + "llama", + "mistral", + ], f"expected --architecture to be one of 'neox', 'mistral', 'llama', but got '{args.architecture}' !" + + with open(args.config_file) as f: + loaded_config = yaml.full_load(f) + if overwrite_values: + loaded_config.update(overwrite_values) + + # Determine the checkpoint format of the model. + # DeepSpeed saves models wrapped in a PipelineModule differently from those not. + # PipelineModule models are saved as per-layer state dicts per TP shard, + # while Sequential model state dicts are saved all together in one mp_rank_xx_model_states.pt + # file per tensor/model parallel shard. + pipeline_world_size = get_key(loaded_config, "pipe-parallel-size", 1) + if pipeline_world_size == 0: + sequential = True + print( + f"Detected 'pipe-parallel-size' of {pipeline_world_size}, assuming model is saved as Sequential..." + ) + else: + sequential = False + print( + f"Detected 'pipe-parallel-size' of {pipeline_world_size}, assuming model is saved as PipelineModule..." + ) + + # convert the model to HF. + hf_model = convert( + args.input_dir, + loaded_config, + args.output_dir, + sequential=sequential, + architecture=args.architecture, + ) + + # Save to disk. + hf_model.save_pretrained(args.output_dir) + + if not args.no_save_tokenizer: + # save tokenizer to directory as well, for easy loading of model as a HF model. + tokenizer_type = get_key(loaded_config, "tokenizer-type") + + if tokenizer_type == "HFTokenizer": # TODO: handle sentencepiece tokenizers? + print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") + print( + "Warning: please check that your model config and tokenizer end with the correct special tokens (EOS, BOS)." 
+ ) + from transformers import PreTrainedTokenizerFast + + tokenizer = PreTrainedTokenizerFast( + tokenizer_file=get_key(loaded_config, "vocab-file") + ) + print("loaded tokenizer: ", tokenizer) + tokenizer.save_pretrained(args.output_dir) + print("tokenizer saved!") + + +if __name__ == "__main__": + + # before running script: + # `pip install --upgrade transformers` + # `huggingface-cli login` + # + main() diff --git a/tools/ckpts/convert_raw_llama_weights_to_neox.py b/tools/ckpts/convert_raw_llama_weights_to_neox.py index 5940fa856..a28f1d0fb 100644 --- a/tools/ckpts/convert_raw_llama_weights_to_neox.py +++ b/tools/ckpts/convert_raw_llama_weights_to_neox.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,13 +24,19 @@ "7B": 11008, "13B": 13824, "30B": 17920, + "34B": 22016, "65B": 22016, + "70B": 28672, + "mistral-7B-v0.1": 14336, } NUM_SHARDS = { "7B": 1, "13B": 2, "30B": 4, + "34B": 4, "65B": 8, + "70B": 8, + "mistral-7B-v0.1": 1, } @@ -66,19 +72,30 @@ def convert_model_pipeline( num_input_shards = NUM_SHARDS[model_size] num_layers = params["n_layers"] num_heads = params["n_heads"] + if "n_kv_heads" in params: + num_kv_heads = params["n_kv_heads"] + else: + num_kv_heads = num_heads + num_kv_heads_per_input_shard = num_kv_heads // num_input_shards num_heads_per_input_shard = num_heads // num_input_shards num_heads_per_output_shard = num_heads // num_output_shards + num_kv_heads_per_output_shard = num_kv_heads // num_output_shards hidden_size = params["dim"] dims_per_head = hidden_size // num_heads # base = 10000.0 # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) def permute_rotary(w): - assert w.shape == (num_heads, dims_per_head, hidden_size) + if w.shape == (num_heads, dims_per_head, hidden_size): + N_HEADS = num_heads + elif w.shape == (num_kv_heads, dims_per_head, hidden_size): + N_HEADS = num_kv_heads + else: + assert False return ( - w.view(num_heads, dims_per_head // 2, 2, hidden_size) + w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) .transpose(1, 2) - .reshape(num_heads, dims_per_head, hidden_size) + .reshape(N_HEADS, dims_per_head, hidden_size) ) pbar = tqdm.tqdm(total=num_input_shards + num_layers + 3) @@ -112,6 +129,7 @@ def permute_rotary(w): ], dim=1, ) + print(embeddings_in.shape) helper.save_shards( {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 ) @@ -143,6 +161,14 @@ def permute_rotary(w): if model_size == "7B": rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") + elif "mistral" in model_size: + # mistral does not include rope freqs in the distributed checkpoint, unlike llama. + # rather than making this buffer always non-persistent on the NeoX side, + # just create and save it for Mistral. 
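For reference, a hedged sketch of how the converter above might be invoked end to end. Every path below is a placeholder, and only flags defined by the script's argument parser are used.

```python
# hypothetical invocation of tools/ckpts/convert_neox_to_hf.py; all paths are placeholders
import subprocess

subprocess.run(
    [
        "python", "tools/ckpts/convert_neox_to_hf.py",
        "--input_dir", "/path/to/model/global_step143000",
        "--config_file", "/path/to/model/config.yml",
        "--output_dir", "/path/to/hf_output",
        "--precision", "auto",
        "--architecture", "neox",
    ],
    check=True,
)
```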
+ base = 10000.0 + rope_freqs = 1.0 / ( + base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) + ) else: rope_freqs = loaded[0]["rope.freqs"] helper.del_loaded("rope.freqs") @@ -210,23 +236,25 @@ def permute_rotary(w): torch.cat( [ loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size + num_kv_heads_per_input_shard, dims_per_head, hidden_size ) for rank in range(num_input_shards) ], dim=0, ) - ) + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + w_v = torch.cat( [ loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size + num_kv_heads_per_input_shard, dims_per_head, hidden_size ) for rank in range(num_input_shards) ], dim=0, - ) - sharded_qkv = torch.stack( + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + sharded_qkv = torch.cat( [ helper.shard( w_q, dim=0 @@ -236,9 +264,11 @@ def permute_rotary(w): ], dim=2, ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size + sharded_qkv = sharded_qkv.view( num_output_shards, - num_heads_per_output_shard * 3 * dims_per_head, + num_heads_per_output_shard * dims_per_head + + 2 * num_kv_heads_per_output_shard * dims_per_head, hidden_size, ) helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") @@ -301,19 +331,30 @@ def convert_model_sequential( num_input_shards = NUM_SHARDS[model_size] num_layers = params["n_layers"] num_heads = params["n_heads"] + if "n_kv_heads" in params: + num_kv_heads = params["n_kv_heads"] + else: + num_kv_heads = num_heads + num_kv_heads_per_input_shard = num_kv_heads // num_input_shards num_heads_per_input_shard = num_heads // num_input_shards num_heads_per_output_shard = num_heads // num_output_shards + num_kv_heads_per_output_shard = num_kv_heads // num_output_shards hidden_size = params["dim"] dims_per_head = hidden_size // num_heads # base = 10000.0 # inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) def permute_rotary(w): - assert w.shape == (num_heads, dims_per_head, hidden_size) + if w.shape == (num_heads, dims_per_head, hidden_size): + N_HEADS = num_heads + elif w.shape == (num_kv_heads, dims_per_head, hidden_size): + N_HEADS = num_kv_heads + else: + assert False return ( - w.view(num_heads, dims_per_head // 2, 2, hidden_size) + w.view(N_HEADS, dims_per_head // 2, 2, hidden_size) .transpose(1, 2) - .reshape(num_heads, dims_per_head, hidden_size) + .reshape(N_HEADS, dims_per_head, hidden_size) ) pbar = tqdm.tqdm(total=num_input_shards + num_output_shards) @@ -345,6 +386,7 @@ def permute_rotary(w): ], dim=1, ) + helper.add_sequential_shard( {"word_embeddings.weight": helper.shard(embeddings_in, dim=0)}, layer_i=0 ) @@ -370,6 +412,14 @@ def permute_rotary(w): if model_size == "7B": rope_freqs = loaded[0]["layers.0.attention.inner_attention.rope.freqs"] helper.del_loaded("layers.0.attention.inner_attention.rope.freqs") + elif "mistral" in model_size: + # mistral does not include rope freqs in the distributed checkpoint, unlike llama. + # rather than making this buffer always non-persistent on the NeoX side, + # just create and save it for Mistral. 
+ base = 10000.0 + rope_freqs = 1.0 / ( + base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) + ) else: rope_freqs = loaded[0]["rope.freqs"] helper.del_loaded("rope.freqs") @@ -433,27 +483,30 @@ def permute_rotary(w): dim=0, ) ) + w_k = permute_rotary( torch.cat( [ loaded[rank][f"layers.{layer_i}.attention.wk.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size + num_kv_heads_per_input_shard, dims_per_head, hidden_size ) for rank in range(num_input_shards) ], dim=0, ) - ) + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + w_v = torch.cat( [ loaded[rank][f"layers.{layer_i}.attention.wv.weight"].view( - num_heads_per_input_shard, dims_per_head, hidden_size + num_kv_heads_per_input_shard, dims_per_head, hidden_size ) for rank in range(num_input_shards) ], dim=0, - ) - sharded_qkv = torch.stack( + ).view(num_heads, int(dims_per_head * (num_kv_heads / num_heads)), hidden_size) + + sharded_qkv = torch.cat( [ helper.shard( w_q, dim=0 @@ -463,11 +516,14 @@ def permute_rotary(w): ], dim=2, ) # num_output_shards, num_heads_per_output_shard, QKV=3, dims_per_head, hidden_size + sharded_qkv = sharded_qkv.view( num_output_shards, - num_heads_per_output_shard * 3 * dims_per_head, + num_heads_per_output_shard * dims_per_head + + 2 * num_kv_heads_per_output_shard * dims_per_head, hidden_size, ) + helper.del_loaded(f"layers.{layer_i}.attention.wq.weight") helper.del_loaded(f"layers.{layer_i}.attention.wk.weight") helper.del_loaded(f"layers.{layer_i}.attention.wv.weight") @@ -548,7 +604,7 @@ def save_duplicates(self, dictionary, layer_i: int): ) def save(self, obj, layer_i, rank): - torch.save(obj, self.save_path(layer_i=layer_i + 2, rank=rank)) + torch.save(obj, self.save_path(layer_i=layer_i, rank=rank)) def shard(self, x, dim): x_shape = list(x.shape) @@ -588,19 +644,19 @@ def add_sequential(self, dictionary, layer_i, rank): def main(): parser = argparse.ArgumentParser( - description="Convert raw LLaMA checkpoints to GPT-NeoX format." + description="Convert raw LLaMA or Mistral checkpoints to GPT-NeoX format." ) parser.add_argument( "--input_dir", - help="Location of LLaMA weights, which contains tokenizer.model and model folders", + help="Location of parent directory, which contains tokenizer.model and model weights subfolders", ) parser.add_argument( "--model_size", - choices=["7B", "13B", "30B", "65B", "tokenizer_only"], + choices=["7B", "mistral-7B-v0.1", "13B", "30B", "34B", "65B", "tokenizer_only"], ) parser.add_argument( "--output_dir", - help="Location to write GPT-NeoX mode", + help="Location to write GPT-NeoX model", ) parser.add_argument( "--num_output_shards", diff --git a/tools/ckpts/convert_sequential_to_hf.py b/tools/ckpts/convert_sequential_to_hf.py deleted file mode 100644 index f40a1d233..000000000 --- a/tools/ckpts/convert_sequential_to_hf.py +++ /dev/null @@ -1,383 +0,0 @@ -# Copyright (c) 2023, EleutherAI -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
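As background for the sequential-format handling consolidated above, a small illustrative sketch of how one shard of a Sequential checkpoint is indexed. The filename and layer key follow the patterns used by these converters, but the file itself is hypothetical.

```python
import torch

# illustrative only: one model-parallel shard of a Sequential checkpoint,
# whose weights live under state["module"]["sequential.<layer_idx>.<param name>"]
state = torch.load("mp_rank_00_model_states.pt", map_location="cpu")

# layer index 0 holds the input embedding
embeddings = state["module"]["sequential.0.word_embeddings.weight"]
print(embeddings.shape)
```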
- -import os -import sys - -import yaml -import argparse -from tqdm import tqdm - -import torch -from transformers import GPTNeoXConfig, GPTNeoXForCausalLM - -from typing import List - -sys.path.append( - os.path.abspath( - os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir) - ) -) -from megatron.tokenizer import build_tokenizer - - -""" -A script for converting saved NeoX Checkpoints to Huggingface (HF) compatible GPT-NeoX type models. - -Note that this script does not support all NeoX features. -Please investigate carefully whether your model is compatible with all architectures supported by the GPTNeoXForCausalLM class in HF. - -(e.g. position embeddings such as AliBi may not be supported by Huggingface's GPT-NeoX architecture. -""" - - -def load_partitions(input_checkpoint_path, mp_partitions) -> List[torch.Tensor]: - """Returns a list containing all states from a model (across MP partitions)""" - - loaded_tp_ranks = [ - torch.load( - os.path.join( - input_checkpoint_path, - f"mp_rank_{i:02}_model_states.pt", - ), - map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu"), - ) - for i in range(mp_partitions) - ] - - return loaded_tp_ranks - - -def get_state( - state_dicts: List[torch.Tensor], - key: str, - layer_idx: int, -) -> torch.Tensor: - """Accesses all MP partitions of a given layer/weight's state.""" - # main DeepSpeed saves each MP partition - key = f"sequential.{layer_idx}.{key}" - - return [state_dict["module"][key] for state_dict in state_dicts] - - -def get_key(loaded_config, key, default=None): - """ - Search for a given key in a NeoX yaml. normalizes underscores -> hyphens - """ - key = key.replace("_", "-") - try: - return loaded_config[key] - except KeyError: - key = key.replace("-", "_") - try: - return loaded_config[key] - except KeyError: - return default - - -def create_config(neox_config): - """take in a loaded yaml from NeoX and assign relevant values to HF config. - Returns: GPTNeoXConfig() object - """ - - class TokenizerArgs: - # kinda hacky. - # this is to get something with the same interface as is used in build_tokenizer() - # without diving into loading a neox_args object or using argparse etc. - def __init__(self, neox_config): - self.make_vocab_size_divisible_by = get_key( - neox_config, "make-vocab-size-divisible-by", default=128 - ) - self.model_parallel_size = get_key(neox_config, "model-parallel-size") - self.vocab_file = get_key(neox_config, "vocab-file") - self.merge_file = get_key(neox_config, "merge-file") - self.tokenizer_type = get_key(neox_config, "tokenizer-type") - - self.rank = 0 - - args = TokenizerArgs(neox_config) - tokenizer = build_tokenizer(args) - try: # GPT2TokenizerFast raises NotImplementedError - pad_token = tokenizer.pad - except: - pad_token = ( - 1 # pad defaulting to 1. follows convention from GPT-NeoX-20b tokenizer - ) - - # TODO: change the default value here based on discussion regarding `gpt_j_tied` config parameter's default - use_tied_lns = get_key(neox_config, "gpt-j-tied", False) - - if use_tied_lns: - raise NotImplementedError( - """ERROR: Huggingface Transformers does not yet support a single shared layernorm - per transformer block for GPT-NeoX models trained w/ GPT-J parallel residuals. - See https://github.com/EleutherAI/gpt-neox/pull/481 for further details.""" - ) - - # set all config values. 
- hf_config = GPTNeoXConfig( - vocab_size=args.padded_vocab_size, - hidden_size=get_key(neox_config, "hidden-size"), - num_hidden_layers=get_key(neox_config, "num-layers"), - num_attention_heads=get_key(neox_config, "num-attention-heads"), - intermediate_size=(get_key(neox_config, "hidden-size") * 4), - hidden_act=get_key(neox_config, "activation", default="gelu"), - rotary_pct=get_key(neox_config, "rotary-pct", default=1.0), - rotary_emb_base=get_key(neox_config, "rotary-emb-base", default=10000), - max_position_embeddings=get_key(neox_config, "max-position-embeddings"), - initializer_range=get_key(neox_config, "init-method-std", 0.02), - layer_norm_eps=get_key(neox_config, "layernorm-epsilon", 1e-5), - use_cache=True, - bos_token_id=tokenizer.eod, - eos_token_id=tokenizer.eod, - tie_word_embeddings=(not get_key(neox_config, "no-weight-tying", False)), - use_parallel_residual=get_key(neox_config, "gpt-j-residual", False), - ) - return hf_config - - -def convert(input_checkpoint_path, loaded_config, output_checkpoint_path): - """convert a NeoX checkpoint to a HF model format. - should perform model-parallel merging correctly - but only supports features allowed by HF GPT-NeoX implementation (e.g. rotary embeddings) - """ - - hf_config = create_config(loaded_config) - - hf_model = GPTNeoXForCausalLM(hf_config) - - # save model in FP16 if Deepspeed fp16 was used in config, else 32 bit - fp16 = get_key(loaded_config, "fp16") - # save model in fp16/bf16 if Deepspeed fp16 or bf16 mixed precision was used in config, else 32 bit weights - fp16 = get_key(loaded_config, "fp16") - if fp16: - try: - # current behavior is to pass "fp16": {"enabled": true}, when using upstream Deepspeed - if fp16["enabled"]: - hf_model.half() - print("Saving weights in fp16 precision...") - except: - try: - # attempt to access bf16 dict in yaml file, if fp16 not enabled - bf16 = get_key(loaded_config, "bf16") - if bf16: - hf_model.to(dtype=torch.bfloat16) - print("Saving weights in bf16 precision...") - except: - print( - "Model not trained in fp16 / bf16 mixed precision, saving weights in fp32..." - ) - - mp_partitions = get_key(loaded_config, "model-parallel-size") - - # DeepSpeed main saves all model states from an MP rank in one file. 
load the MP ranks only once and index into them with get_state() - loaded_tp_ranks = load_partitions(input_checkpoint_path, mp_partitions) - - ### Embedding layer ### - # Embedding is layer idx 0 - hf_model.gpt_neox.embed_in.load_state_dict( - { - "weight": torch.cat( - get_state(loaded_tp_ranks, "word_embeddings.weight", 0), dim=0 - ) - } - ) - assert ( - hf_config.vocab_size == hf_model.gpt_neox.embed_in.weight.shape[0] - ), f"ERROR: calculated vocab size {hf_config.vocab_size} != embed param size {hf_model.gpt_neox.embed_in.shape[0]}" - ### End Embedding Layer ### - - for layer_i in tqdm(range(get_key(loaded_config, "num-layers"))): - - # get layer from hf model - hf_layer = hf_model.gpt_neox.layers[layer_i] - - # + 2 bc of embed layer and a dummy _pre_transformer_block - state_dict = {} - for key in [ - "attention.dense.weight", - "mlp.dense_4h_to_h.weight", - ]: - state_dict[key] = torch.cat( - get_state(loaded_tp_ranks, key, layer_i + 2), dim=1 - ) - - # average layernorm stats over mp ranks - for key in [ - "input_layernorm.weight", - "input_layernorm.bias", - "post_attention_layernorm.weight", - "post_attention_layernorm.bias", - ]: - state_dict[key] = sum(get_state(loaded_tp_ranks, key, layer_i + 2)) / len( - loaded_tp_ranks - ) - - # LinearWithTPMerge - for key in [ - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias", - "attention.query_key_value.weight", - "attention.query_key_value.bias", - ]: - state_dict[key] = torch.cat( - get_state(loaded_tp_ranks, key, layer_i + 2), dim=0 - ) - - # LinearWithTPSplitBias - for key in [ - "mlp.dense_4h_to_h.bias", - "attention.dense.bias", - ]: - state_dict[key] = sum(get_state(loaded_tp_ranks, key, layer_i + 2)) / len( - loaded_tp_ranks - ) - - # Just take one - state_dict["attention.rotary_emb.inv_freq"] = get_state( - loaded_tp_ranks, "attention.rotary_emb.inv_freq", layer_i + 2 - )[0] - - if "attention.bias" in hf_layer.state_dict(): - state_dict["attention.bias"] = hf_layer.state_dict()["attention.bias"] - if "attention.masked_bias" in hf_layer.state_dict(): - state_dict["attention.masked_bias"] = hf_layer.state_dict()[ - "attention.masked_bias" - ] - - # load state_dict into layer - hf_layer.load_state_dict(state_dict) - - # Load final layer norm - hf_model.gpt_neox.final_layer_norm.load_state_dict( - { - "weight": ( - sum( - get_state( - loaded_tp_ranks, - "norm.weight", - get_key(loaded_config, "num-layers") + 3, - ) - ) - ) - / len(loaded_tp_ranks), - "bias": ( - sum( - get_state( - loaded_tp_ranks, - "norm.bias", - get_key(loaded_config, "num-layers") + 3, - ) - ) - ) - / len(loaded_tp_ranks), - } - ) - # output embedding / LM head - hf_model.embed_out.load_state_dict( - { - "weight": torch.cat( - get_state( - loaded_tp_ranks, - "final_linear.weight", - get_key(loaded_config, "num-layers") + 4, - ), - dim=0, - ), - } - ) - - del loaded_tp_ranks - - return hf_model - - -def main(input_args=None, overwrite_values=None): - from huggingface_hub import create_repo, HfApi - - parser = argparse.ArgumentParser( - description="Merge MP partitions and convert to HF Model." - ) - parser.add_argument( - "--input_dir", - type=str, - help="Path to NeoX checkpoint, e.g. 
/path/to/model/global_step143000", - ) - parser.add_argument( - "--config_file", - type=str, - help="Path to config file for the input NeoX checkpoint.", - ) - parser.add_argument( - "--output_dir", - type=str, - help="Output dir, where to save the HF Model, tokenizer, and configs", - ) - parser.add_argument( - "--upload", - action="store_true", - help="Set to true in order to upload to the HF Hub directly.", - ) - args = parser.parse_args(input_args) - - with open(args.config_file) as f: - loaded_config = yaml.full_load(f) - if overwrite_values: - loaded_config.update(overwrite_values) - - hf_model = convert(args.input_dir, loaded_config, args.output_dir) - - hf_model.save_pretrained(args.output_dir) - - # save tokenizer to directory as well, for easy loading of model as a HF model - tokenizer_type = get_key(loaded_config, "tokenizer-type") - - if tokenizer_type == "HFTokenizer": - print(f"saving tokenizer from file {get_key(loaded_config, 'vocab-file')}") - from transformers import PreTrainedTokenizerFast - - tokenizer = PreTrainedTokenizerFast( - tokenizer_file=get_key(loaded_config, "vocab-file") - ) - print("loaded tokenizer: ", tokenizer) - tokenizer.save_pretrained(args.output_dir) - print("tokenizer saved!") - - print( - tokenizer.decode( - hf_model.generate( - tokenizer.encode("Hello, I am testing ", return_tensors="pt") - )[0] - ) - ) - - if args.upload: - repo_name = input("Provide a repository name for the HF Hub: ") - create_repo(repo_name, repo_type="model", private=False, use_auth_token=True) - - api = HfApi() - api.upload_folder( - folder_path=args.output_dir, - repo_id=repo_name, - repo_type="model", - ) - - -if __name__ == "__main__": - - # before running script: - # `pip install --upgrade transformers` - # `huggingface-cli login` - # - main() diff --git a/tools/ckpts/inspect_checkpoints.py b/tools/ckpts/inspect_checkpoints.py index 0bd5c05b4..a23d456ca 100644 --- a/tools/ckpts/inspect_checkpoints.py +++ b/tools/ckpts/inspect_checkpoints.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/ckpts/merge20b.py b/tools/ckpts/merge20b.py index ae1f5a0a3..09029bb5b 100644 --- a/tools/ckpts/merge20b.py +++ b/tools/ckpts/merge20b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/ckpts/upload.py b/tools/ckpts/upload.py index 324a7debe..01d585be8 100644 --- a/tools/ckpts/upload.py +++ b/tools/ckpts/upload.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/datasets/corpora.py b/tools/datasets/corpora.py index 7d2ec4e8c..2c440dc0a 100644 --- a/tools/datasets/corpora.py +++ b/tools/datasets/corpora.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tools/datasets/dataset_token_count.py b/tools/datasets/dataset_token_count.py new file mode 100644 index 000000000..c9a4ff3ba --- /dev/null +++ b/tools/datasets/dataset_token_count.py @@ -0,0 +1,30 @@ +# Script counts tokens in a pretokenized dataset from preprocess_data.py +# Necessary for setting batch size, train_iters, etc + +import sys +import os + +## Necessary for the import +project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) +sys.path.insert(0, project_root) + +from megatron.data import indexed_dataset +import numpy as np + +if len(sys.argv) < 2: + print( + "Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..." + ) + sys.exit(1) + +# Access the command-line arguments +arguments = sys.argv[1:] + +for arg in arguments: + print("Checking file", arg) + try: + dataset = indexed_dataset.make_dataset(arg, "mmap") + size = np.sum(dataset.sizes) + print("Dataset size in tokens is", size) + except AttributeError: + print("Dataset could not be loaded", arg) diff --git a/tools/datasets/preprocess_data.py b/tools/datasets/preprocess_data.py index 74ff251e0..32e656ace 100644 --- a/tools/datasets/preprocess_data.py +++ b/tools/datasets/preprocess_data.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/tools/datasets/preprocess_data_with_mask.py b/tools/datasets/preprocess_data_with_mask.py index b2fba73ec..ec2fddbe4 100644 --- a/tools/datasets/preprocess_data_with_mask.py +++ b/tools/datasets/preprocess_data_with_mask.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/train.py b/train.py index 5f4ddce53..2e4b09954 100644 --- a/train.py +++ b/train.py @@ -1,7 +1,7 @@ -# Copyright (c) 2021, EleutherAI +# Copyright (c) 2024, EleutherAI # This file is based on code by the authors denoted below and has been modified from its original version. # -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
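As a follow-up to the token-count script above, a short sketch of how such a count is typically turned into a rough `train_iters` estimate. The dataset path and batch settings are assumptions, and `megatron` is assumed to be importable from the repository root.

```python
import numpy as np
from megatron.data import indexed_dataset

# hypothetical pretokenized dataset prefix produced by preprocess_data.py
dataset = indexed_dataset.make_dataset("/abs/path/to/mydata_text_document", "mmap")
total_tokens = int(np.sum(dataset.sizes))

# assumed batch settings; substitute the values from your training config
train_batch_size = 1024   # sequences per optimizer step
seq_length = 2048         # tokens per sequence

print("dataset size in tokens:", total_tokens)
print("approx. steps for one pass:", total_tokens // (train_batch_size * seq_length))
```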