From 516df5ee0dcdcd2a2cf90690871b47039bfd1eb2 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 10:56:59 +0200
Subject: [PATCH 01/30] WIP: test

---
 megatron/model/test/test_gpt_model.py | 73 +++++++++++++++++++++++++++
 tests/test_basic.py                   |  2 +
 2 files changed, 75 insertions(+)
 create mode 100644 megatron/model/test/test_gpt_model.py

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
new file mode 100644
index 000000000..3f5da24e3
--- /dev/null
+++ b/megatron/model/test/test_gpt_model.py
@@ -0,0 +1,73 @@
+import unittest
+from unittest.mock import patch
+
+from megatron.model import GPTModel
+
+def default_args():
+    """Return a dictionary mapping each argument name to its value."""
+    VOCAB_FILE=""
+    MERGE_FILE=""
+
+    CHECKPOINT_PATH=""
+    DATA_PATH=""
+
+    return {
+        # GPT_ARGS
+        "--num-layers": "2",
+        "--hidden-size": "128",
+        "--num-attention-heads": "4",
+        "--seq-length": "256",
+        "--max-position-embeddings": "256",
+        "--micro-batch-size": "4",
+        "--global-batch-size": "8",
+        "--lr-decay-iters": "320000",
+        "--lr-decay-style": "cosine",
+        "--lr": "0.00015",
+        "--min-lr": "1.0e-5",
+        "--lr-decay-style": "cosine",
+        "--train-iters": "5000",
+        "--vocab-file": VOCAB_FILE,
+        "--merge-file": MERGE_FILE,
+        "--data-impl": "mmap",
+        "--split": "949,50,1",
+        "--distributed-backend": "nccl",
+        "--weight-decay": "1e-2",
+        "--clip-grad": "1.0",
+        "--lr-warmup-fraction": ".01",
+        "--fp16": "",
+        "--prefix-lm": "",
+        "--reset-attention-mask": "",
+
+        # OUTPUT_ARGS
+        "--log-interval": "10",
+        "--save-interval": "500",
+        "--eval-interval": "100",
+        "--eval-iters": "10",
+        "--checkpoint-activations": "",
+
+        # DATA_ARGS
+        "--save": CHECKPOINT_PATH,
+        "--load": CHECKPOINT_PATH,
+        "--data-path": DATA_PATH,
+
+    }
+
+class MyTestCase(unittest.TestCase):
+    def test_gpt_causal(self):
+        """Test causal invariance, i.e. past tokens don't depend on future tokens."""
+        with patch('sys.argv', ['stuff']):
+
+            model = GPTModel(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=True,
+                post_process=True
+            )
+            self.assertEqual(True, False)  # add assertion here
+
+    def test_gpt_prefix(self):
+        """Test prefix invariance, i.e. past tokens in the target don't depend on future tokens."""
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 915d2c100..af5eee3cc 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,3 +1,5 @@
 def test_import():
     import megatron
+
+

From eececbb3d7ca5c6b2cba741a2a7ae4ed3dd7c197 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 12:18:05 +0200
Subject: [PATCH 02/30] Still trying to figure out deepspeed

---
 megatron/model/test/test_gpt_model.py | 132 +++++++++++++++++++++++---
 1 file changed, 118 insertions(+), 14 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 3f5da24e3..9e3811d74 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -1,9 +1,15 @@
 import unittest
+from random import randint
 from unittest.mock import patch

-from megatron.model import GPTModel
+import torch

-def default_args():
+from megatron import initialize_megatron, get_args, get_tokenizer
+from megatron.model import GPTModelPipe
+from pretrain_gpt import get_batch_pipe
+
+
+def get_default_args():
     """Return a dictionary mapping each argument name to its value."""
     VOCAB_FILE=""
     MERGE_FILE=""
@@ -12,6 +18,9 @@ def default_args():
     DATA_PATH=""

     return {
+        # Deepspeed
+        "--deepspeed": "",
+
         # GPT_ARGS
         "--num-layers": "2",
         "--hidden-size": "128",
@@ -24,7 +33,6 @@ def default_args():
         "--lr-decay-style": "cosine",
         "--lr": "0.00015",
         "--min-lr": "1.0e-5",
-        "--lr-decay-style": "cosine",
         "--train-iters": "5000",
         "--vocab-file": VOCAB_FILE,
         "--merge-file": MERGE_FILE,
@@ -35,8 +43,6 @@ def default_args():
         "--clip-grad": "1.0",
         "--lr-warmup-fraction": ".01",
         "--fp16": "",
-        "--prefix-lm": "",
-        "--reset-attention-mask": "",

         # OUTPUT_ARGS
         "--log-interval": "10",
@@ -49,24 +55,122 @@ def default_args():
         "--save": CHECKPOINT_PATH,
         "--load": CHECKPOINT_PATH,
         "--data-path": DATA_PATH,
-
     }

+def flatten_arguments(args):
+    """
+    Converts a dictionary of arguments into a list
+
+    Example: {"arg1": "value1", "arg2": "value2"} -> ["arg1", "value1", "arg2", "value2"]
+    """
+    return [item for key_value in args.items() for item in key_value]
+
 class MyTestCase(unittest.TestCase):
     def test_gpt_causal(self):
         """Test causal invariance, i.e. past tokens don't depend on future tokens."""
-        with patch('sys.argv', ['stuff']):
+        command_args = get_default_args()
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+            tokenizer = get_tokenizer()
+
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+            )
+
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+
+            # eod is a special token
+            token_ids[token_ids == tokenizer.eod] += 1
+            token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
+
+            # we set a variation on the inputs
+            changed_index = randint(0, args.seq_length - 1)
+            token_ids_changed = token_ids.clone()
+            token_ids_changed[changed_index] = (token_ids_changed[changed_index] + 1) % args.padded_vocab_size
+
+            model.forward()[get_batch_pipe(token_ids)]
+
+            position_ids = torch.arange(args.seq_length).unsqueeze(0)
+            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+
+            output = model(input_ids, position_ids, attention_mask)[0]
+
+
+
+            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+
+            # All tokens in the past should be unchanged
+            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])

-            model = GPTModel(
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=True,
-                post_process=True
-            )
-            self.assertEqual(True, False)  # add assertion here

     def test_gpt_prefix(self):
         """Test prefix invariance, i.e. past tokens in the target don't depend on future tokens."""
+        command_args = get_default_args()
+
+        command_args["--prefix-lm"] = "",
+        command_args["--reset-attention-mask"] = "",
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=True,
+                post_process=True
+            )
+
+            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+            position_ids = torch.arange(args.seq_length).unsqueeze(0)
+            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+
+            output = model(input_ids, position_ids, attention_mask)[0]
+
+            changed_index = randint(0, args.seq_length - 1)
+            input_ids_changed = input_ids.clone()
+            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+
+            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+
+            # All tokens in the past should be unchanged
+            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+
+    def test_gpt_rotary_embeddings(self):
+        """Test rotary embeddings"""
+        command_args = get_default_args()
+
+        del command_args["--max-position-embeddings"]
+        command_args["--position-embedding-type"] = "rotary"
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=True,
+                post_process=True
+            )
+
+            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+            position_ids = torch.arange(args.seq_length).unsqueeze(0)
+            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+
+            output = model(input_ids, position_ids, attention_mask)[0]
+
+            changed_index = randint(0, args.seq_length - 1)
+            input_ids_changed = input_ids.clone()
+            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+
+            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+
+            # All tokens in the past should be unchanged
+            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])

 if __name__ == '__main__':
     unittest.main()
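The invariance that test_gpt_causal is after can be seen in isolation, without Megatron or DeepSpeed. Below is a minimal sketch in plain PyTorch (toy_causal_attention and all shapes are illustrative names, not part of the patched code): under a lower-triangular attention mask, perturbing the token at one position leaves every earlier position's output bit-for-bit unchanged and alters the perturbed position and everything after it.

    import torch

    def toy_causal_attention(x):
        """Single-head self-attention with a causal (lower-triangular) mask."""
        scores = x @ x.transpose(-1, -2) / x.shape[-1] ** 0.5
        causal_mask = torch.tril(torch.ones(x.shape[1], x.shape[1], dtype=torch.bool))
        scores = scores.masked_fill(~causal_mask, float("-inf"))
        return torch.softmax(scores, dim=-1) @ x

    torch.manual_seed(0)
    x = torch.randn(1, 8, 16)           # (batch, seq, hidden)
    changed_index = 3
    x_changed = x.clone()
    x_changed[:, changed_index] += 1.0  # perturb a single position

    out, out_changed = toy_causal_attention(x), toy_causal_attention(x_changed)
    # Positions before the change are computed from identical inputs...
    assert torch.equal(out[:, :changed_index], out_changed[:, :changed_index])
    # ...while the changed position and everything after it differ.
    assert not torch.equal(out[:, changed_index:], out_changed[:, changed_index:])

This is exactly the split the test asserts on the model output, once the comparison machinery is working.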
From b78dfaad3201460ebd79cd3bbd9a192169e90746 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 14:42:11 +0200
Subject: [PATCH 03/30] WIP

---
 megatron/model/test/test_gpt_model.py | 96 ++++++++++++---------------
 pretrain_prefix_lm.py                 |  2 +-
 2 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 9e3811d74..ababd81f6 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -3,11 +3,12 @@ from unittest.mock import patch

 import torch
+from deepspeed import deepspeed

 from megatron import initialize_megatron, get_args, get_tokenizer
 from megatron.model import GPTModelPipe
-from pretrain_gpt import get_batch_pipe
-
+from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
+from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe

 def get_default_args():
     """Return a dictionary mapping each argument name to its value."""
@@ -75,10 +76,7 @@ def test_gpt_causal(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model = GPTModelPipe(
-                num_tokentypes=0,
-                parallel_output=True,
-            )
+            model_engine = deepspeed.init_inference(gpt_model_provider())

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -86,28 +84,28 @@ def test_gpt_causal(self):
             token_ids[token_ids == tokenizer.eod] += 1
             token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            # we set a variation on the inputs
-            changed_index = randint(0, args.seq_length - 1)
-            token_ids_changed = token_ids.clone()
-            token_ids_changed[changed_index] = (token_ids_changed[changed_index] + 1) % args.padded_vocab_size
-
-            model.forward()[get_batch_pipe(token_ids)]
-
-            position_ids = torch.arange(args.seq_length).unsqueeze(0)
-            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
-
-            output = model(input_ids, position_ids, attention_mask)[0]
-
+            # process batch
+            input_batch = get_gpt_batch_pipe(token_ids)[0]

+            # get a modified version of the first batch
+            changed_index = randint(0, args.seq_length - 2)
+            input_token_ids_changed = input_batch[0].clone()
+            # We randomly increment the index by one of that index
+            input_token_ids_changed[changed_index] = (input_token_ids_changed[changed_index] + 1) % args.padded_vocab_size

-            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+            output = model_engine(input_batch)
+            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])

     def test_gpt_prefix(self):
-        """Test prefix invariance, i.e. past tokens in the target don't depend on future tokens."""
+        """
+        Test prefix invariances:
+         - Past tokens in the target don't depend on future tokens.
+         - Input tokens
+        """
         command_args = get_default_args()

         command_args["--prefix-lm"] = "",
@@ -116,25 +114,28 @@ def test_gpt_prefix(self):
         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
             args = get_args()
+            tokenizer = get_tokenizer()
+
+            model_engine = deepspeed.init_inference(prefix_lm_model_provider())

-            model = GPTModelPipe(
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=True,
-                post_process=True
-            )
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

-            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
-            position_ids = torch.arange(args.seq_length).unsqueeze(0)
-            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+            # eod is a special token
+            token_ids[token_ids == tokenizer.eod] += 1
+            token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            output = model(input_ids, position_ids, attention_mask)[0]
+            # process batch
+            input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)

-            changed_index = randint(0, args.seq_length - 1)
-            input_ids_changed = input_ids.clone()
-            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+            # get a modified version of the first batch
+            changed_index = randint(0, args.seq_length - 2)
+            input_token_ids_changed = input_batch[0].clone()
+            # We randomly increment the index by one of that index
+            input_token_ids_changed[changed_index] = (input_token_ids_changed[
+                changed_index] + 1) % args.padded_vocab_size

-            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+            output = model_engine(input_batch)
+            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
@@ -149,29 +150,20 @@ def test_gpt_rotary_embeddings(self):
         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
             args = get_args()
+            tokenizer = get_tokenizer()

-            model = GPTModelPipe(
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=True,
-                post_process=True
-            )
-
-            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
-            position_ids = torch.arange(args.seq_length).unsqueeze(0)
-            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
-
-            output = model(input_ids, position_ids, attention_mask)[0]
+            model_engine = deepspeed.init_inference(gpt_model_provider())

-            changed_index = randint(0, args.seq_length - 1)
-            input_ids_changed = input_ids.clone()
-            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

-            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+            # eod is a special token
+            token_ids[token_ids == tokenizer.eod] += 1
+            token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            # All tokens in the past should be unchanged
-            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+            # process batch
+            input_batch = get_gpt_batch_pipe(token_ids)[0]

+            model_engine(input_batch)

 if __name__ == '__main__':
     unittest.main()
diff --git a/pretrain_prefix_lm.py b/pretrain_prefix_lm.py
index 0f4928021..5b168e8b9 100644
--- a/pretrain_prefix_lm.py
+++ b/pretrain_prefix_lm.py
@@ -145,7 +145,7 @@ def get_batch_pipe(data):
         loss_on_targets_only=args.loss_on_targets_only
     )

-    return (tokens, position_ids, attention_mask), (labels, loss_mask)
+    return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices

 def loss_func(loss_mask, output_tensor):
     losses = output_tensor.float()

From 0bfc2f4fb09ad95513d68c170ed630aac9c98c7e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 15:23:16 +0200
Subject: [PATCH 04/30] Test test

---
 megatron/model/test/test_gpt_model.py | 99 ++++++++++++++++++++++-----
 1 file changed, 82 insertions(+), 17 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index ababd81f6..f5657c7f1 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -87,24 +87,35 @@ def test_gpt_causal(self):
             # process batch
             input_batch = get_gpt_batch_pipe(token_ids)[0]

-            # get a modified version of the first batch
+            # get a modified version of the first batch, we change a specific index
             changed_index = randint(0, args.seq_length - 2)
             input_token_ids_changed = input_batch[0].clone()
-            # We randomly increment the index by one of that index
-            input_token_ids_changed[changed_index] = (input_token_ids_changed[changed_index] + 1) % args.padded_vocab_size
+            # We increment the token_id by one for that index in order to artificially change the sequence.
+            input_token_ids_changed[changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

             output = model_engine(input_batch)
             output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
-            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+            self.assertTrue(
+                torch.all(
+                    output[:, :changed_index].eq(output_changed[:, :changed_index])
+                )
+            )
+            # All tokens in the future should have changed
+            self.assertFalse(
+                torch.any(
+                    output[:, changed_index:].eq(output_changed[:, changed_index:])
+                )
+            )

     def test_gpt_prefix(self):
         """
         Test prefix invariances:
-         - Past tokens in the target don't depend on future tokens.
-         - Input tokens
+         - Past target tokens don't depend on future target tokens.
+         - Target tokens depend on input tokens.
+         - Input tokens depend on all other input tokens, but never target tokens.
         """
         command_args = get_default_args()

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

-            # eod is a special token
+            # eod is a special token, this also guarantees that the whole row is considered as a document.
             token_ids[token_ids == tokenizer.eod] += 1
             token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            # process batch
-            input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+            # process batch to have non-empty prefix
+            for i in range(9, -1, -1):
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+                if (prefix_indices[0][0] != 0):
+                    break
+                if i == 0:
+                    # FIXME: find a better way to not obtain empty prefix
+                    raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
+
+            output = model_engine(input_batch)

+            ## --------------- CHANGE A TARGET TOKEN ---------------------------
             # get a modified version of the first batch
-            changed_index = randint(0, args.seq_length - 2)
-            input_token_ids_changed = input_batch[0].clone()
-            # We randomly increment the index by one of that index
-            input_token_ids_changed[changed_index] = (input_token_ids_changed[
-                changed_index] + 1) % args.padded_vocab_size
+            changed_target_index = prefix_indices[0][0] # guaranteed to exist as each row has at least one partial document
+            token_ids_changed_target = input_batch[0].clone()
+            # We increment the token id on the changed index.
+            token_ids_changed_target[changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
+            # make sure we're not changing a token to eod as it's a special token
+            token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
+            token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size

-            output = model_engine(input_batch)
-            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))
+            # Test change
+            output_changed_target = model_engine((token_ids_changed_target, *input_batch[1:]))

             # All tokens in the past should be unchanged
-            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+            self.assertTrue(
+                torch.all(
+                    output[0, :changed_target_index].eq(output_changed_target[0, :changed_target_index])
+                )
+            )
+            # All tokens in the future should have changed
+            self.assertFalse(
+                torch.any(
+                    output[0, changed_target_index:].eq(output_changed_target[0, changed_target_index:])
+                )
+            )
+            # Unmodified rows should not change either
+            self.assertTrue(
+                torch.all(
+                    output[1, :].eq(output_changed_target[1, :])
+                )
+            )
+
+            ## --------------- CHANGE AN INPUT TOKEN ---------------------------
+            # Let's change the last prefix token and make sure that the first token changed
+            last_prefix_index = prefix_indices[0][0] - 1 # guaranteed to be positive as we avoid pathological case previously
+            token_ids_changed_input = input_batch[0].clone()
+            # We increment the token id on the changed index.
+            token_ids_changed_input[changed_target_index] = (token_ids_changed_input[
+                0, last_prefix_index] + 1) % args.padded_vocab_size
+            # make sure we're not changing a token to eod as it's a special token
+            token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
+            token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
+
+            output_changed_input = model_engine((token_ids_changed_input, *input_batch[1:]))
+
+            # All tokens should be changed
+            self.assertFalse(
+                torch.any(
+                    output[0, :].eq(output_changed_input[0, :])
+                )
+            )
+            # Unmodified rows should not change either
+            self.assertTrue(
+                torch.all(
+                    output[1, :].eq(output_changed_input[1, :])
+                )
+            )

     def test_gpt_rotary_embeddings(self):
         """Test rotary embeddings"""

From b8016371a395c23380dc9975a51a55e9d4c992da Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 19:16:04 +0200
Subject: [PATCH 05/30] Test how to set up deepspeed in unit tests

---
 megatron/model/test/test_gpt_model.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index f5657c7f1..401eb958d 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -19,9 +19,6 @@ def get_default_args():
     DATA_PATH=""

     return {
-        # Deepspeed
-        "--deepspeed": "",
-
         # GPT_ARGS
         "--num-layers": "2",
         "--hidden-size": "128",
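The three prefix-LM invariances listed in the docstring above follow from the shape of the attention mask: prefix (input) tokens attend bidirectionally among themselves, while target tokens remain causal. Here is a self-contained sketch of that mask and of the two perturbation checks the test performs, in plain PyTorch with made-up names (the real mask construction lives in Megatron's utilities):

    import torch

    def toy_prefix_attention(x, prefix_len):
        """Self-attention with a prefix-LM mask: positions < prefix_len attend
        bidirectionally within the prefix; later positions attend causally."""
        seq = x.shape[1]
        mask = torch.tril(torch.ones(seq, seq, dtype=torch.bool))
        mask[:prefix_len, :prefix_len] = True  # full attention inside the prefix
        scores = (x @ x.transpose(-1, -2) / x.shape[-1] ** 0.5).masked_fill(~mask, float("-inf"))
        return torch.softmax(scores, dim=-1) @ x

    torch.manual_seed(0)
    x = torch.randn(1, 8, 16)
    prefix_len = 4

    # Changing the last prefix token affects every position, because every
    # row of the mask can see the prefix.
    x_changed = x.clone()
    x_changed[:, prefix_len - 1] += 1.0
    out = toy_prefix_attention(x, prefix_len)
    out_changed = toy_prefix_attention(x_changed, prefix_len)
    assert not torch.equal(out, out_changed)

    # Changing a target token only affects that position and later ones.
    x_changed = x.clone()
    x_changed[:, prefix_len + 1] += 1.0
    out_changed = toy_prefix_attention(x_changed, prefix_len)
    assert torch.equal(out[:, :prefix_len + 1], out_changed[:, :prefix_len + 1])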
- Example: {"arg1": "value1", "arg2": "value2"} -> ["arg1", "value1", "arg2", "value2"] + Note: we add "IGNORED" at the beginning as this value is ignored by the argparser + + Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"] """ - return [item for key_value in args.items() for item in key_value] + return ["IGNORED"] + [item for key_value in args.items() for item in key_value] class MyTestCase(unittest.TestCase): def test_gpt_causal(self): From aeca8c146d033e7a1fa8b92da0bd603546c8c7c3 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 22:10:06 +0200 Subject: [PATCH 07/30] Empty strings might be problematic --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index f6a3f5710..4c7e3c416 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -62,7 +62,7 @@ def flatten_arguments(args): Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"] """ - return ["IGNORED"] + [item for key_value in args.items() for item in key_value] + return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): def test_gpt_causal(self): From 520ef720ca8accbc950375720becc11152330b09 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 22:25:34 +0200 Subject: [PATCH 08/30] Remove unecessary arguments --- megatron/model/test/test_gpt_model.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 4c7e3c416..9417d2fee 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -11,12 +11,6 @@ def get_default_args(): """return a dictionary with key as argument name and value as additional arguments""" - VOCAB_FILE="" - MERGE_FILE="" - - CHECKPOINT_PATH="" - DATA_PATH="" - return { # GPT_ARGS "--num-layers": "2", @@ -31,8 +25,8 @@ def get_default_args(): "--lr": "0.00015", "--min-lr": "1.0e-5", "--train-iters": "5000", - "--vocab-file": VOCAB_FILE, - "--merge-file": MERGE_FILE, + "--tokenizer": "PretrainedFromHF", + "--tokenizer-name-or-path": "gpt2", "--data-impl": "mmap", "--split": "949,50,1", "--distributed-backend": "nccl", @@ -49,9 +43,6 @@ def get_default_args(): "--checkpoint-activations": "", # DATA_ARGS - "--save": CHECKPOINT_PATH, - "--load": CHECKPOINT_PATH, - "--data-path": DATA_PATH, } def flatten_arguments(args): From 37522b41c0edfce359f1890ec70764d13be0257e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 22:55:35 +0200 Subject: [PATCH 09/30] Woops --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 9417d2fee..9c2b0e60d 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -25,7 +25,7 @@ def get_default_args(): "--lr": "0.00015", "--min-lr": "1.0e-5", "--train-iters": "5000", - "--tokenizer": "PretrainedFromHF", + "--tokenizer-type": "PretrainedFromHF", "--tokenizer-name-or-path": "gpt2", "--data-impl": "mmap", "--split": "949,50,1", From 76f01fecd5dfaa475c39b088ad5944ee905298bb Mon 
Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 23:16:38 +0200 Subject: [PATCH 10/30] Remove global variables at the end of each test and init deepspeed --- megatron/model/test/test_gpt_model.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 9c2b0e60d..16573a827 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -56,6 +56,25 @@ def flatten_arguments(args): return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): + def setUpClass(cls) -> None: + deepspeed.init_distributed() + + def tearDown(self) -> None: + # We reset all global variables + global _GLOBAL_ARGS + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + global _GLOBAL_TOKENIZER + global _GLOBAL_TENSORBOARD_WRITER + global _GLOBAL_ADLR_AUTORESUME + global _GLOBAL_TIMERS + + _GLOBAL_ARGS = None + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + _GLOBAL_TOKENIZER = None + _GLOBAL_TENSORBOARD_WRITER = None + _GLOBAL_ADLR_AUTORESUME = None + _GLOBAL_TIMERS = None + def test_gpt_causal(self): """Test causal invariance, ie past token don't depend on future tokens.""" command_args = get_default_args() From 188b33b60e0f05c59e650a427d7842949c1311b0 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 23:17:59 +0200 Subject: [PATCH 11/30] Woops --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 16573a827..0db89f02c 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -56,7 +56,7 @@ def flatten_arguments(args): return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): - def setUpClass(cls) -> None: + def setUpClass(self) -> None: deepspeed.init_distributed() def tearDown(self) -> None: From 57191c4038a17b4ef93d39c05415df4417acbd09 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 23:20:00 +0200 Subject: [PATCH 12/30] Maybe adding classmethod --- megatron/model/test/test_gpt_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 0db89f02c..6cdf14185 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -56,7 +56,8 @@ def flatten_arguments(args): return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): - def setUpClass(self) -> None: + @classmethod + def setUpClass(cls) -> None: deepspeed.init_distributed() def tearDown(self) -> None: From 1389e6dc30841838552ebae6299b8e1bacf49543 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 03:16:08 +0200 Subject: [PATCH 13/30] Woops --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 6cdf14185..465305961 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -2,8 +2,8 @@ from random import randint from 
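A note on the teardown strategy introduced in patches 10-12: a `global _GLOBAL_ARGS` statement executed in the test module only rebinds that name inside the test module itself; it cannot reach the module-level variables defined in megatron.global_vars, which is presumably why a later patch in this series switches to assigning through the module object instead. A minimal sketch of the difference (somelib is an illustrative stand-in, built as an in-memory module so the snippet is self-contained):

    import types

    # Stand-in for a library module with global state (e.g. megatron.global_vars).
    somelib = types.ModuleType("somelib")
    somelib._STATE = "initialized"

    def broken_reset():
        # `global` rebinds the name in the module where this function is
        # defined; it never touches somelib's own globals.
        global _STATE
        _STATE = None

    def working_reset():
        # Assigning through the module object mutates somelib's global.
        somelib._STATE = None

    broken_reset()
    assert somelib._STATE == "initialized"  # unaffected
    working_reset()
    assert somelib._STATE is None           # actually reset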
From 1389e6dc30841838552ebae6299b8e1bacf49543 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 03:16:08 +0200
Subject: [PATCH 13/30] Woops

---
 megatron/model/test/test_gpt_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 6cdf14185..465305961 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -2,8 +2,8 @@
 from random import randint
 from unittest.mock import patch

+import deepspeed
 import torch
-from deepspeed import deepspeed

 from megatron import initialize_megatron, get_args, get_tokenizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe

From e45854085c3645a6f96ff9088e8a45f1d54ac5e8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 03:19:04 +0200
Subject: [PATCH 14/30] Add debug print to check that tear down happens

---
 megatron/model/test/test_gpt_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 465305961..43b736d6f 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -62,6 +62,7 @@ def setUpClass(cls) -> None:

     def tearDown(self) -> None:
         # We reset all global variables
+        print("Tearing down args.")
         global _GLOBAL_ARGS
         global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
         global _GLOBAL_TOKENIZER

From d7f331f06d502a68679a40b594116eb962f4d542 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 03:25:27 +0200
Subject: [PATCH 15/30] Reset global variables before

---
 megatron/model/test/test_gpt_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 43b736d6f..0820d9075 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -60,9 +60,8 @@ class MyTestCase(unittest.TestCase):
     def setUpClass(cls) -> None:
         deepspeed.init_distributed()

-    def tearDown(self) -> None:
+    def setUp(self) -> None:
         # We reset all global variables
-        print("Tearing down args.")
         global _GLOBAL_ARGS
         global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
         global _GLOBAL_TOKENIZER

From af9a7164411a81c05296e6060cbbfb31e7902453 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:10:20 +0200
Subject: [PATCH 16/30] Let's test this

---
 megatron/model/test/test_gpt_model.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 0820d9075..2d88c658f 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -1,3 +1,4 @@
+import argparse
 import unittest
 from random import randint
 from unittest.mock import patch
@@ -6,6 +7,7 @@
 import torch

 from megatron import initialize_megatron, get_args, get_tokenizer
+from megatron.training import setup_model_and_optimizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
 from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe

@@ -85,7 +87,7 @@ def test_gpt_causal(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model_engine = deepspeed.init_inference(gpt_model_provider())
+            model, _, _ = setup_model_and_optimizer(gpt_model_provider)

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -239,5 +241,10 @@ def test_gpt_rotary_embeddings(self):

             model_engine(input_batch)

+def get_deepspeed_args():
+    parser = argparse.ArgumentParser()
+    return deepspeed.add_config_arguments(parser)
+
 if __name__ == '__main__':
+    get_deepspeed_args()
     unittest.main()

From b90812497104036f7936edc96c634ed7f148cd3c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:21:15 +0200
Subject: [PATCH 17/30] Try something else

---
 megatron/model/test/test_gpt_model.py | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 2d88c658f..956d32b3b 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -6,7 +6,7 @@
 import deepspeed
 import torch

-from megatron import initialize_megatron, get_args, get_tokenizer
+from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
 from megatron.training import setup_model_and_optimizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
 from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
@@ -64,19 +64,12 @@ def setUpClass(cls) -> None:

     def setUp(self) -> None:
         # We reset all global variables
-        global _GLOBAL_ARGS
-        global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
-        global _GLOBAL_TOKENIZER
-        global _GLOBAL_TENSORBOARD_WRITER
-        global _GLOBAL_ADLR_AUTORESUME
-        global _GLOBAL_TIMERS
-
-        _GLOBAL_ARGS = None
-        _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
-        _GLOBAL_TOKENIZER = None
-        _GLOBAL_TENSORBOARD_WRITER = None
-        _GLOBAL_ADLR_AUTORESUME = None
-        _GLOBAL_TIMERS = None
+        global_vars._GLOBAL_ARGS = None
+        global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
+        global_vars._GLOBAL_TOKENIZER = None
+        global_vars._GLOBAL_TENSORBOARD_WRITER = None
+        global_vars._GLOBAL_ADLR_AUTORESUME = None
+        global_vars._GLOBAL_TIMERS = None

     def test_gpt_causal(self):
         """Test causal invariance, i.e. past tokens don't depend on future tokens."""
@@ -89,7 +89,7 @@ def test_gpt_causal(self):
             token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

             # process batch
-            input_batch = get_gpt_batch_pipe(token_ids)[0]
+            input_batch = get_gpt_batch_pipe({"text": token_ids})[0]

             # get a modified version of the first batch, we change a specific index
             changed_index = randint(0, args.seq_length - 2)
             input_token_ids_changed = input_batch[0].clone()
             # We increment the token_id by one for that index in order to artificially change the sequence.
             input_token_ids_changed[changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

-            output = model_engine(input_batch)
-            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))
+            output = model(input_batch)
+            output_changed = model((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertTrue(
From 28cea9564037f1eb6a96eb28967d9f77bd8feb09 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:26:07 +0200
Subject: [PATCH 18/30] WIP

---
 megatron/model/test/test_gpt_model.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 956d32b3b..c5760db44 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -95,7 +95,7 @@ def test_gpt_causal(self):
             changed_index = randint(0, args.seq_length - 2)
             input_token_ids_changed = input_batch[0].clone()
             # We increment the token_id by one for that index in order to artificially change the sequence.
-            input_token_ids_changed[changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size
+            input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

             output = model(input_batch)
             output_changed = model((input_token_ids_changed, *input_batch[1:]))
@@ -131,7 +131,7 @@ def test_gpt_prefix(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model_engine = deepspeed.init_inference(prefix_lm_model_provider())
+            model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider)

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -148,7 +148,7 @@ def test_gpt_prefix(self):
             # process batch to have non-empty prefix
             for i in range(9, -1, -1):
-                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
                 if (prefix_indices[0][0] != 0):
                     break
                 if i == 0:
                     # FIXME: find a better way to not obtain empty prefix
                     raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")

-            output = model_engine(input_batch)
+            output = model(input_batch)

             ## --------------- CHANGE A TARGET TOKEN ---------------------------
             # get a modified version of the first batch
@@ -161,7 +161,7 @@ def test_gpt_prefix(self):
             # Test change
-            output_changed_target = model_engine((token_ids_changed_target, *input_batch[1:]))
+            output_changed_target = model((token_ids_changed_target, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertTrue(
@@ -193,7 +193,7 @@ def test_gpt_prefix(self):
-            output_changed_input = model_engine((token_ids_changed_input, *input_batch[1:]))
+            output_changed_input = model((token_ids_changed_input, *input_batch[1:]))

             # All tokens should be changed
             self.assertFalse(
@@ -221,7 +221,7 @@ def test_gpt_rotary_embeddings(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model_engine = deepspeed.init_inference(gpt_model_provider())
+            model, _, _ = setup_model_and_optimizer(gpt_model_provider)

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -232,7 +232,7 @@ def test_gpt_rotary_embeddings(self):
             # process batch
             input_batch = get_gpt_batch_pipe(token_ids)[0]

-            model_engine(input_batch)
+            model(input_batch)

From 642ef91de06fa964f323a69dbe949bcb9f295f14 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:35:42 +0200
Subject: [PATCH 19/30] More fixes

---
 megatron/model/test/test_gpt_model.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index c5760db44..c9203fdf2 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -37,6 +37,9 @@ def get_default_args():
         "--lr-warmup-fraction": ".01",
         "--fp16": "",

+        "--attention-dropout": "0",
+        "--hidden-dropout": "0",
+
         # OUTPUT_ARGS
         "--log-interval": "10",
         "--save-interval": "500",
@@ -81,6 +84,7 @@ def test_gpt_causal(self):
             tokenizer = get_tokenizer()

             model, _, _ = setup_model_and_optimizer(gpt_model_provider)
+            model = model[0]

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -123,8 +127,8 @@ def test_gpt_prefix(self):
         """
         command_args = get_default_args()

-        command_args["--prefix-lm"] = "",
-        command_args["--reset-attention-mask"] = "",
+        command_args["--prefix-lm"] = ""
+        command_args["--reset-attention-mask"] = ""

         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
             args = get_args()
             tokenizer = get_tokenizer()

             model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider)
+            model = model[0]

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -146,7 +151,7 @@ def test_gpt_prefix(self):
             # process batch to have non-empty prefix
             for i in range(9, -1, -1):
-                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
                 if (prefix_indices[0][0] != 0):
                     break
                 if i == 0:
                     # FIXME: find a better way to not obtain empty prefix
                     raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
@@ -222,6 +227,7 @@ def test_gpt_rotary_embeddings(self):
             tokenizer = get_tokenizer()

             model, _, _ = setup_model_and_optimizer(gpt_model_provider)
+            model = model[0]

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -232,7 +238,7 @@ def test_gpt_rotary_embeddings(self):
             # process batch
-            input_batch = get_gpt_batch_pipe(token_ids)[0]
+            input_batch = get_gpt_batch_pipe({"text": token_ids})[0]

             model(input_batch)

From 5143ce68effb53c354f9a7e0cdf0247bb5dba9a0 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:37:44 +0200
Subject: [PATCH 20/30] More fixes

---
 megatron/model/test/test_gpt_model.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index c9203fdf2..09f985b9b 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -101,8 +101,8 @@ def test_gpt_causal(self):
             # We increment the token_id by one for that index in order to artificially change the sequence.
             input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

-            output = model(input_batch)
-            output_changed = model((input_token_ids_changed, *input_batch[1:]))
+            output = model(*input_batch)
+            output_changed = model(input_token_ids_changed, *input_batch[1:])

             # All tokens in the past should be unchanged
             self.assertTrue(
@@ -153,7 +153,7 @@ def test_gpt_prefix(self):
                     # FIXME: find a better way to not obtain empty prefix
                     raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")

-            output = model(input_batch)
+            output = model(*input_batch)

             ## --------------- CHANGE A TARGET TOKEN ---------------------------
             # get a modified version of the first batch
@@ -166,7 +166,7 @@ def test_gpt_prefix(self):
             # Test change
-            output_changed_target = model((token_ids_changed_target, *input_batch[1:]))
+            output_changed_target = model(token_ids_changed_target, *input_batch[1:])

             # All tokens in the past should be unchanged
             self.assertTrue(
@@ -198,7 +198,7 @@ def test_gpt_prefix(self):
-            output_changed_input = model((token_ids_changed_input, *input_batch[1:]))
+            output_changed_input = model(token_ids_changed_input, *input_batch[1:])

             # All tokens should be changed
             self.assertFalse(
@@ -238,7 +238,7 @@ def test_gpt_rotary_embeddings(self):
             # process batch
             input_batch = get_gpt_batch_pipe({"text": token_ids})[0]

-            model(input_batch)
+            model(*input_batch)

From 8cfb92ce95dad0b32c9368ea7132d3853cec565d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:41:53 +0200
Subject: [PATCH 21/30] More stuff to fix

---
 megatron/model/test/test_gpt_model.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 09f985b9b..f8e80f87a 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -111,6 +111,10 @@ def test_gpt_causal(self):
                 )
             )
             # All tokens in the future should have changed
+            print(torch.any(
+                output[:, changed_index:].eq(output_changed[:, changed_index:])
+                )
+            )
             self.assertFalse(
                 torch.any(
                     output[:, changed_index:].eq(output_changed[:, changed_index:])
                 )
             )
@@ -160,7 +164,7 @@ def test_gpt_prefix(self):
             changed_target_index = prefix_indices[0][0] # guaranteed to exist as each row has at least one partial document
             token_ids_changed_target = input_batch[0].clone()
             # We increment the token id on the changed index.
-            token_ids_changed_target[changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
+            token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
             # make sure we're not changing a token to eod as it's a special token
             token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
             token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size
@@ -192,7 +196,7 @@ def test_gpt_prefix(self):
             last_prefix_index = prefix_indices[0][0] - 1 # guaranteed to be positive as we avoid pathological case previously
             token_ids_changed_input = input_batch[0].clone()
             # We increment the token id on the changed index.
-            token_ids_changed_input[changed_target_index] = (token_ids_changed_input[
+            token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[
                 0, last_prefix_index] + 1) % args.padded_vocab_size
             # make sure we're not changing a token to eod as it's a special token
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
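Several of the fixes in patches 18 through 21 correct the same two-dimensional indexing slip: on a (micro_batch_size, seq_length) tensor, `t[i]` selects sample i's entire sequence along the first dimension, not position i across the batch. A three-line illustration of the distinction:

    import torch

    token_ids = torch.arange(12).reshape(3, 4)  # (batch=3, seq_length=4)

    # tensor[i] indexes the *first* dimension: this is sample 1's whole
    # sequence, not position 1 in every sample.
    row = token_ids[1]       # tensor([4, 5, 6, 7])

    # To change one position, index both dimensions explicitly:
    token_ids[:, 2] += 1     # position 2 in every sample
    token_ids[0, 2] += 1     # position 2 in sample 0 only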
From 9dbd9399590cf74cbe9809e4e28aab5d8385373d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 18:03:29 +0200
Subject: [PATCH 22/30] We really want to compare vectors and not coordinates

---
 megatron/model/test/test_gpt_model.py | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index f8e80f87a..e5f8040db 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -60,6 +60,10 @@ def flatten_arguments(args):
     """
     return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]

+def equal_vectors(tensor1, tensor2, dim = -1):
+    """View tensor1 and tensor2 as a list of vectors, and compute equality"""
+    return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
+
 class MyTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -106,19 +110,11 @@ def test_gpt_causal(self):

             # All tokens in the past should be unchanged
             self.assertTrue(
-                torch.all(
-                    output[:, :changed_index].eq(output_changed[:, :changed_index])
-                )
+                torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index]))
             )
             # All tokens in the future should have changed
-            print(torch.any(
-                output[:, changed_index:].eq(output_changed[:, changed_index:])
-                )
-            )
             self.assertFalse(
-                torch.any(
-                    output[:, changed_index:].eq(output_changed[:, changed_index:])
-                )
+                torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
             )

             # All tokens in the past should be unchanged
             self.assertTrue(
                 torch.all(
-                    output[0, :changed_target_index].eq(output_changed_target[0, :changed_target_index])
+                    equal_vectors(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
                 )
             )
             # All tokens in the future should have changed
             self.assertFalse(
                 torch.any(
-                    output[0, changed_target_index:].eq(output_changed_target[0, changed_target_index:])
+                    equal_vectors(output[0, changed_target_index:], output_changed_target[0, changed_target_index:])
                 )
             )
             # Unmodified rows should not change either
             self.assertTrue(
                 torch.all(
-                    output[1, :].eq(output_changed_target[1, :])
+                    equal_vectors(output[1, :], output_changed_target[1, :])
                 )
             )

             # All tokens should be changed
             self.assertFalse(
                 torch.any(
-                    output[0, :].eq(output_changed_input[0, :])
+                    equal_vectors(output[0, :], output_changed_input[0, :])
                 )
             )
             # Unmodified rows should not change either
             self.assertTrue(
                 torch.all(
-                    output[1, :].eq(output_changed_input[1, :])
+                    equal_vectors(output[1, :], output_changed_input[1, :])
                 )
             )
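As the commit subject says, `equal_vectors` compares hidden states as whole vectors: `torch.linalg.norm(tensor1 - tensor2, dim=-1)` collapses the hidden dimension, so the result is one boolean per (batch, position) pair. By contrast, `.eq()` compares coordinate-wise, and `torch.any` over it can be tripped by a single coincidentally equal coordinate inside an otherwise changed vector. A small usage sketch of the helper exactly as defined in the patch:

    import torch

    def equal_vectors(tensor1, tensor2, dim=-1):
        """View tensor1 and tensor2 as a list of vectors, and compute equality"""
        return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0

    a = torch.zeros(2, 3, 4)          # (batch, seq, hidden)
    b = a.clone()
    b[0, 1, 0] = 1.0                  # perturb one coordinate of one vector

    print(equal_vectors(a, b).shape)  # torch.Size([2, 3]): one bool per vector
    print(equal_vectors(a, b)[0, 1])  # tensor(False): the perturbed vector
    print((a == b)[0, 1])             # coordinate-wise: 3 of 4 are still True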
changed self.assertFalse( torch.any( - output[0, :].eq(output_changed_input[0, :]) + equal_vectors(output[0, :], output_changed_input[0, :]) ) ) # Unchanged changed rows should not change either self.assertTrue( torch.all( - output[1, :].eq(output_changed_input[1, :]) + equal_vectors(output[1, :], output_changed_input[1, :]) ) ) From 82c6ca1d05e7cdb842ff0daa7d4e97c71f02fa41 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 18:05:55 +0200 Subject: [PATCH 23/30] Reformat --- megatron/model/test/test_gpt_model.py | 28 ++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index e5f8040db..2fe66d078 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -1,4 +1,3 @@ -import argparse import unittest from random import randint from unittest.mock import patch @@ -11,6 +10,7 @@ from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe + def get_default_args(): """return a dictionary with key as argument name and value as additional arguments""" return { @@ -50,6 +50,7 @@ def get_default_args(): # DATA_ARGS } + def flatten_arguments(args): """ Converts dictionary argument to a list. @@ -60,10 +61,12 @@ def flatten_arguments(args): """ return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] -def equal_vectors(tensor1, tensor2, dim = -1): + +def equal_vectors(tensor1, tensor2, dim=-1): """View tensor1 and tensor2 as a list of vectors, and compute equality""" return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0 + class MyTestCase(unittest.TestCase): @classmethod def setUpClass(cls) -> None: @@ -103,7 +106,8 @@ def test_gpt_causal(self): changed_index = randint(0, args.seq_length - 2) input_token_ids_changed = input_batch[0].clone() # We increment the token_id by one for that index in order to artificially change the sequence. - input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size + input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, + changed_index] + 1) % args.padded_vocab_size output = model(*input_batch) output_changed = model(input_token_ids_changed, *input_batch[1:]) @@ -113,11 +117,11 @@ def test_gpt_causal(self): torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index])) ) # All tokens in the future should have changed + print(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) self.assertFalse( torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) ) - def test_gpt_prefix(self): """ Test prefix invariances: @@ -157,10 +161,12 @@ def test_gpt_prefix(self): ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch - changed_target_index = prefix_indices[0][0] # guaranteed to exist as each row has at least one partial document + changed_target_index = prefix_indices[0][ + 0] # guaranteed to exist as each row has at least one partial document token_ids_changed_target = input_batch[0].clone() # We increment the token id on the changed index. 
- token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size + token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[ + 0, changed_target_index] + 1) % args.padded_vocab_size # make sure we're not changing a token to eod as it's a special token token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1 token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size @@ -189,11 +195,12 @@ def test_gpt_prefix(self): ## --------------- CHANGE AN INPUT TOKEN --------------------------- # Let's change the the last prefix token and make sure that the first token changed - last_prefix_index = prefix_indices[0][0] - 1 # guaranteed to be positive as we avoid pathological case previously + last_prefix_index = prefix_indices[0][ + 0] - 1 # guaranteed to be positive as we avoid pathological case previously token_ids_changed_input = input_batch[0].clone() # We increment the token id on the changed index. token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[ - 0, last_prefix_index] + 1) % args.padded_vocab_size + 0, last_prefix_index] + 1) % args.padded_vocab_size # make sure we're not changing a token to eod as it's a special token token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size @@ -213,7 +220,6 @@ def test_gpt_prefix(self): ) ) - def test_gpt_rotary_embeddings(self): """Test rotary embeddings""" command_args = get_default_args() @@ -240,10 +246,6 @@ def test_gpt_rotary_embeddings(self): model(*input_batch) -def get_deepspeed_args(): - parser = argparse.ArgumentParser() - return deepspeed.add_config_arguments(parser) if __name__ == '__main__': - get_deepspeed_args() unittest.main() From 7c6ea150d147e89481a5d9bd4561901c9667d38f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 18:21:56 +0200 Subject: [PATCH 24/30] check something out --- megatron/model/test/test_gpt_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 2fe66d078..b6e8fe854 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -117,7 +117,6 @@ def test_gpt_causal(self): torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index])) ) # All tokens in the future should have changed - print(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) self.assertFalse( torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) ) @@ -161,8 +160,8 @@ def test_gpt_prefix(self): ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch - changed_target_index = prefix_indices[0][ - 0] # guaranteed to exist as each row has at least one partial document + # guaranteed to exist as each row has at least one partial document + changed_target_index = prefix_indices[0][0] token_ids_changed_target = input_batch[0].clone() # We increment the token id on the changed index. 
token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[ @@ -195,8 +194,8 @@ def test_gpt_prefix(self): ## --------------- CHANGE AN INPUT TOKEN --------------------------- # Let's change the the last prefix token and make sure that the first token changed - last_prefix_index = prefix_indices[0][ - 0] - 1 # guaranteed to be positive as we avoid pathological case previously + # guaranteed to be positive as we avoid pathological case previously + last_prefix_index = prefix_indices[0][0] - 1 token_ids_changed_input = input_batch[0].clone() # We increment the token id on the changed index. token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[ @@ -208,6 +207,7 @@ def test_gpt_prefix(self): output_changed_input = model(token_ids_changed_input, *input_batch[1:]) # All tokens should be changed + print(equal_vectors(output[0, :], output_changed_input[0, :])) self.assertFalse( torch.any( equal_vectors(output[0, :], output_changed_input[0, :]) From 076b69f2a1823187dea87bc2bb3d68211f716aa9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 18:28:29 +0200 Subject: [PATCH 25/30] fix test --- megatron/model/test/test_gpt_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index b6e8fe854..e74819308 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -106,8 +106,8 @@ def test_gpt_causal(self): changed_index = randint(0, args.seq_length - 2) input_token_ids_changed = input_batch[0].clone() # We increment the token_id by one for that index in order to artificially change the sequence. - input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, - changed_index] + 1) % args.padded_vocab_size + input_token_ids_changed[:, changed_index] = \ + (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size output = model(*input_batch) output_changed = model(input_token_ids_changed, *input_batch[1:]) @@ -164,8 +164,8 @@ def test_gpt_prefix(self): changed_target_index = prefix_indices[0][0] token_ids_changed_target = input_batch[0].clone() # We increment the token id on the changed index. - token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[ - 0, changed_target_index] + 1) % args.padded_vocab_size + token_ids_changed_target[0, changed_target_index] = \ + (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size # make sure we're not changing a token to eod as it's a special token token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1 token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size @@ -198,8 +198,8 @@ def test_gpt_prefix(self): last_prefix_index = prefix_indices[0][0] - 1 token_ids_changed_input = input_batch[0].clone() # We increment the token id on the changed index. 
-            token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[
-                0, last_prefix_index] + 1) % args.padded_vocab_size
+            token_ids_changed_input[0, last_prefix_index] = \
+                (token_ids_changed_input[0, last_prefix_index] + 1) % args.padded_vocab_size
             # make sure we're not changing a token to eod as it's a special token
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size

From 2e0f71a33b8c39c78b7de728a6904810557541d8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 9 Aug 2021 11:18:16 +0200
Subject: [PATCH 26/30] Remove prefix-lm flag as it's integrated

---
 megatron/model/test/test_gpt_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index e74819308..b0a4b7cd4 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -130,7 +130,6 @@ def test_gpt_prefix(self):
         """
         command_args = get_default_args()
 
-        command_args["--prefix-lm"] = ""
        command_args["--reset-attention-mask"] = ""
 
         with patch('sys.argv', flatten_arguments(command_args)):

From 18b1c97cb71b9938cce3504296c3d0c49e1f8356 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 11:48:59 +0200
Subject: [PATCH 27/30] Woops

---
 tests/test_model.py         | 3 +++
 tests/test_preprocessing.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index f76556498..8dc05d887 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -74,6 +74,8 @@ def setUpClass(cls) -> None:
         deepspeed.init_distributed()
 
     def setUp(self) -> None:
+        super().setUp()
+
         # We reset all global variables
         global_vars._GLOBAL_ARGS = None
         global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
@@ -132,6 +134,7 @@ def test_prefix_lm(self):
         command_args = get_default_args()
 
         command_args["--reset-attention-mask"] = ""
+        command_args["--loss-on-targets-only"] = ""
 
         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 0d323234e..0d48752e6 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -101,7 +101,7 @@ def test_process_data_microsoft(self):
         data_dir = f"{self.data_dir}/gpt2"
         output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False)
 
-        input_path = f"{self.tests_dir}/tools/openwebtext-1000.jsonl"
+        input_path = f"{self.tests_dir}/data/gpt2/openwebtext-1000.jsonl"
 
         output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext"
 

From 76aad892bd71b40df592853984d447b3b912e044 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 12:00:26 +0200
Subject: [PATCH 28/30] Add test for without reset attention mask

---
 tests/test_model.py | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index 8dc05d887..7c6357227 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -124,9 +124,9 @@ def test_gpt(self):
             torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
         )
 
-    def test_prefix_lm(self):
+    def test_prefix_lm_reset_attention_mask(self):
         """
-        Test prefix invariances:
+        Test prefix invariances when `reset_attention_mask=True`:
            - Past target tokens don't depend on future target tokens.
            - Target tokens depend on input tokens.
            - Input tokens depend on all other input tokens, but never target tokens.
@@ -210,7 +210,6 @@ def test_prefix_lm(self):
             output_changed_input = model(token_ids_changed_input, *input_batch[1:])
 
             # All tokens should be changed
-            print(equal_vectors(output[0, :], output_changed_input[0, :]))
             self.assertFalse(
                 torch.any(
                     equal_vectors(output[0, :], output_changed_input[0, :])
@@ -223,6 +222,39 @@ def test_prefix_lm(self):
                 )
             )
 
+    def test_prefix_lm_wo_reset_attention_mask(self):
+        """
+        Test prefix invariances when `reset_attention_mask=False`:
+           - Past target tokens don't depend on future target tokens.
+           - Target tokens depend on input tokens.
+           - Input tokens depend on all other input tokens, but never target tokens.
+        """
+        command_args = get_default_args()
+
+        command_args["--loss-on-targets-only"] = ""
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+
+            model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider)
+            model = model[0]
+
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+
+            # process batch to have non-empty prefix
+            for i in range(9, -1, -1):
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
+                if prefix_indices[0][0] != 0:
+                    break
+                if i == 0:
+                    # FIXME: find a better way to avoid an empty prefix
+                    raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
+
+            model(*input_batch)
+
+            # TODO: Check all invariants
+
     def test_gpt_rotary_embeddings(self):
         """Test rotary embeddings"""
         command_args = get_default_args()
@@ -249,6 +281,8 @@ def test_gpt_rotary_embeddings(self):
 
             model(*input_batch)
 
+            # TODO: Check all invariants
+
 
 if __name__ == '__main__':
     unittest.main()

From 86f89283d56264aa678b071dff60370a214df40f Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 12:19:41 +0200
Subject: [PATCH 29/30] Fix test for non reset attention mask

---
 megatron/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index 5e4f9e134..bc0446fc2 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -320,10 +320,10 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_
     assert partial_prefix_indices is None or len(partial_prefix_indices) == micro_batch_size, f"partial_prefix_indices has to be None or its length equal to {micro_batch_size}, got {len(partial_prefix_indices)}"
 
     for batch_id in range(micro_batch_size):
-        prefix_indices.append([])
-
         # Prefix lm per document.
         if reset_attention_mask:
+            prefix_indices.append([])
+
             # Compute the index of all eod tokens in data.
            eod_indices = (data[batch_id] == eod_token).nonzero().squeeze(-1)
@@ -356,6 +356,7 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_
             assert partial_prefix_indices is None or isinstance(partial_prefix_indices[batch_id], int), \
                 f"Per document prefix has to store an int for each row, got {partial_prefix_indices[batch_id]}"
 
+            prefix_index: int
             if partial_prefix_indices is None or partial_prefix_indices[batch_id] is None:
                 # We need to randomly generate a prefix index
                 prefix_index = randint(0, seq_length - 1)
@@ -363,5 +364,6 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_
                 # We get value from partial_prefix_indices, and run validation on that value
                 prefix_index = partial_prefix_indices[batch_id]
                 assert 0 <= prefix_index < seq_length - 1, f"Prefix index needs to be between documents indices, 0 <= {prefix_index} < {seq_length - 1} should be True."
-        prefix_indices[batch_id].append(prefix_index)
+        prefix_indices.append(prefix_index)
+
     return prefix_indices

From fe4a81592b714468308ac108f5f48425544c6cc8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 13:44:10 +0200
Subject: [PATCH 30/30] Fix test

---
 tests/test_model.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index 7c6357227..3079a429a 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -241,15 +241,7 @@ def test_prefix_lm_wo_reset_attention_mask(self):
             model = model[0]
 
             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
-
-            # process batch to have non-empty prefix
-            for i in range(9, -1, -1):
-                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
-                if prefix_indices[0][0] != 0:
-                    break
-                if i == 0:
-                    # FIXME: find a better way to avoid an empty prefix
-                    raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
+            input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
 
             model(*input_batch)
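
The assertions in these tests lean on an equal_vectors helper that the series never shows. Its definition in the repository may differ, but a minimal sketch consistent with how the tests call it, returning one boolean per token position by reducing over the hidden dimension, would be:

import torch

def equal_vectors(tensor1, tensor2, dim=-1):
    # One boolean per position: True where the two activation vectors
    # coincide along `dim` (the hidden dimension in the tests above).
    return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0

Because the reduction happens inside the helper, the surrounding torch.all(...) and torch.any(...) assertions quantify over token positions rather than over individual scalar activations.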
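
The causal test touched by PATCH 24 and PATCH 25 follows a perturbation pattern that applies to any autoregressive model: increment one input token modulo the vocabulary size, then assert that outputs strictly before the changed position are identical while the changed position and everything after it differs. The following self-contained sketch replays that pattern on a toy single-layer causal attention model; the model, vocabulary size, and sequence length are illustrative stand-ins, not anything from the repository:

import torch

torch.manual_seed(0)

class TinyCausalLM(torch.nn.Module):
    """A single causal self-attention layer over learned embeddings."""
    def __init__(self, vocab_size=97, hidden_size=16):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden_size)
        self.attn = torch.nn.MultiheadAttention(hidden_size, num_heads=2, batch_first=True)

    def forward(self, token_ids):
        x = self.embed(token_ids)
        seq_length = token_ids.size(1)
        # True above the diagonal: future positions are masked out.
        future_mask = torch.triu(torch.ones(seq_length, seq_length, dtype=torch.bool), diagonal=1)
        out, _ = self.attn(x, x, x, attn_mask=future_mask, need_weights=False)
        return out

model = TinyCausalLM().eval()
token_ids = torch.randint(97, (1, 12))

# Perturb a single token, mirroring the increment-modulo trick in the tests.
changed_index = 5
token_ids_changed = token_ids.clone()
token_ids_changed[0, changed_index] = (token_ids_changed[0, changed_index] + 1) % 97

with torch.no_grad():
    output = model(token_ids)
    output_changed = model(token_ids_changed)

same = torch.linalg.norm(output - output_changed, dim=-1) == 0
assert same[0, :changed_index].all()      # past positions are identical
assert not same[0, changed_index:].any()  # the change propagates forward

The first assertion is exact, since positions before changed_index attend only to unchanged keys; the second is probabilistic but fails only if a perturbed activation lands exactly on its old value, which does not happen in practice with random weights.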
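
PATCH 29 is easiest to read as fixing the return-shape contract of get_prefix_indices: with reset_attention_mask=True each row yields a list of prefix indices, one per document, while without it each row yields a bare int; before the fix, the non-reset branch appended into a per-row list that is now only created in the reset branch. Below is a toy illustration of that shape contract only, with hypothetical names; the real function's sampling, EOD handling, and validation are more involved:

from random import randint

def prefix_indices_shapes(batch, eod_token, reset_attention_mask):
    prefix_indices = []
    for row in batch:
        if reset_attention_mask:
            # Per-document prefixes: one list of ints per row.
            row_indices = []
            document_start = 0
            for position, token in enumerate(list(row) + [eod_token]):
                if token == eod_token:
                    if position > document_start:
                        row_indices.append(randint(document_start, position - 1))
                    document_start = position + 1
            prefix_indices.append(row_indices)
        else:
            # One prefix for the whole row: a bare int, not a singleton list.
            prefix_indices.append(randint(0, len(row) - 2))
    return prefix_indices

batch = [[5, 7, 0, 9, 9], [3, 3, 3, 3, 3]]  # 0 plays the role of the EOD token
print(prefix_indices_shapes(batch, eod_token=0, reset_attention_mask=True))   # e.g. [[1, 3], [2]]
print(prefix_indices_shapes(batch, eod_token=0, reset_attention_mask=False))  # e.g. [2, 0]

This shape change is also consistent with PATCH 30 dropping the prefix_indices[0][0] != 0 loop from the non-reset test: once that branch returns bare ints per row, prefix_indices[0][0] no longer indexes a per-row list.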