From 516df5ee0dcdcd2a2cf90690871b47039bfd1eb2 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 10:56:59 +0200
Subject: [PATCH 01/30] WIP: test

---
 megatron/model/test/test_gpt_model.py | 73 +++++++++++++++++++++++++++
 tests/test_basic.py                   |  2 +
 2 files changed, 75 insertions(+)
 create mode 100644 megatron/model/test/test_gpt_model.py

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
new file mode 100644
index 000000000..3f5da24e3
--- /dev/null
+++ b/megatron/model/test/test_gpt_model.py
@@ -0,0 +1,73 @@
+import unittest
+from unittest.mock import patch
+
+from megatron.model import GPTModel
+
+def default_args():
+    """Return a dictionary mapping each argument name to its value."""
+    VOCAB_FILE=""
+    MERGE_FILE=""
+
+    CHECKPOINT_PATH=""
+    DATA_PATH=""
+
+    return {
+        # GPT_ARGS
+        "--num-layers": "2",
+        "--hidden-size": "128",
+        "--num-attention-heads": "4",
+        "--seq-length": "256",
+        "--max-position-embeddings": "256",
+        "--micro-batch-size": "4",
+        "--global-batch-size": "8",
+        "--lr-decay-iters": "320000",
+        "--lr-decay-style": "cosine",
+        "--lr": "0.00015",
+        "--min-lr": "1.0e-5",
+        "--lr-decay-style": "cosine",
+        "--train-iters": "5000",
+        "--vocab-file": VOCAB_FILE,
+        "--merge-file": MERGE_FILE,
+        "--data-impl": "mmap",
+        "--split": "949,50,1",
+        "--distributed-backend": "nccl",
+        "--weight-decay": "1e-2",
+        "--clip-grad": "1.0",
+        "--lr-warmup-fraction": ".01",
+        "--fp16": "",
+        "--prefix-lm": "",
+        "--reset-attention-mask": "",
+
+        # OUTPUT_ARGS
+        "--log-interval": "10",
+        "--save-interval": "500",
+        "--eval-interval": "100",
+        "--eval-iters": "10",
+        "--checkpoint-activations": "",
+
+        # DATA_ARGS
+        "--save": CHECKPOINT_PATH,
+        "--load": CHECKPOINT_PATH,
+        "--data-path": DATA_PATH,
+
+    }
+
+class MyTestCase(unittest.TestCase):
+    def test_gpt_causal(self):
+        """Test causal invariance, i.e. past tokens don't depend on future tokens."""
+        with patch('sys.argv', ['stuff']):
+
+            model = GPTModel(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=True,
+                post_process=True
+            )
+            self.assertEqual(True, False)  # add assertion here
+
+    def test_gpt_prefix(self):
+        """Test prefix invariance, i.e. past tokens in the target don't depend on future tokens."""
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 915d2c100..af5eee3cc 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,3 +1,5 @@
 def test_import():
     import megatron
+
+

From eececbb3d7ca5c6b2cba741a2a7ae4ed3dd7c197 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 12:18:05 +0200
Subject: [PATCH 02/30] Still trying to figure out deepspeed

---
 megatron/model/test/test_gpt_model.py | 132 +++++++++++++++++++++++---
 1 file changed, 118 insertions(+), 14 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 3f5da24e3..9e3811d74 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -1,9 +1,15 @@
 import unittest
+from random import randint
 from unittest.mock import patch

-from megatron.model import GPTModel
+import torch

-def default_args():
+from megatron import initialize_megatron, get_args, get_tokenizer
+from megatron.model import GPTModelPipe
+from pretrain_gpt import get_batch_pipe
+
+
+def get_default_args():
     """Return a dictionary mapping each argument name to its value."""
     VOCAB_FILE=""
     MERGE_FILE=""
@@ -12,6 +18,9 @@ def default_args():
     DATA_PATH=""

     return {
+        # Deepspeed
+        "--deepspeed": "",
+
         # GPT_ARGS
         "--num-layers": "2",
         "--hidden-size": "128",
@@ -24,7 +33,6 @@ def default_args():
         "--lr-decay-style": "cosine",
         "--lr": "0.00015",
         "--min-lr": "1.0e-5",
-        "--lr-decay-style": "cosine",
         "--train-iters": "5000",
         "--vocab-file": VOCAB_FILE,
         "--merge-file": MERGE_FILE,
@@ -35,8 +43,6 @@ def default_args():
         "--clip-grad": "1.0",
         "--lr-warmup-fraction": ".01",
         "--fp16": "",
-        "--prefix-lm": "",
-        "--reset-attention-mask": "",

         # OUTPUT_ARGS
         "--log-interval": "10",
@@ -49,24 +55,122 @@ def default_args():
         "--save": CHECKPOINT_PATH,
         "--load": CHECKPOINT_PATH,
         "--data-path": DATA_PATH,
-
     }

+def flatten_arguments(args):
+    """
+    Converts a dictionary of arguments into a list
+
+    Example: {"arg1": "value1", "arg2": "value2"} -> ["arg1", "value1", "arg2", "value2"]
+    """
+    return [item for key_value in args.items() for item in key_value]
+
 class MyTestCase(unittest.TestCase):
     def test_gpt_causal(self):
         """Test causal invariance, i.e. past tokens don't depend on future tokens."""
-        with patch('sys.argv', ['stuff']):
+        command_args = get_default_args()
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+            tokenizer = get_tokenizer()
+
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+            )
+
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+
+            # eod is a special token
+            token_ids[token_ids == tokenizer.eod] += 1
+            token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size
+
+            # we set a variation on the inputs
+            changed_index = randint(0, args.seq_length - 1)
+            token_ids_changed = token_ids.clone()
+            token_ids_changed[changed_index] = (token_ids_changed[changed_index] + 1) % args.padded_vocab_size
+
+            model.forward()[get_batch_pipe(token_ids)]
+
+            position_ids = torch.arange(args.seq_length).unsqueeze(0)
+            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+
+            output = model(input_ids, position_ids, attention_mask)[0]
+
+
+
+            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+
+            # All tokens in the past should be unchanged
+            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])

-            model = GPTModel(
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=True,
-                post_process=True
-            )
-            self.assertEqual(True, False)  # add assertion here

     def test_gpt_prefix(self):
         """Test prefix invariance, i.e. past tokens in the target don't depend on future tokens."""
+        command_args = get_default_args()
+
+        command_args["--prefix-lm"] = "",
+        command_args["--reset-attention-mask"] = "",
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=True,
+                post_process=True
+            )
+
+            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+            position_ids = torch.arange(args.seq_length).unsqueeze(0)
+            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+
+            output = model(input_ids, position_ids, attention_mask)[0]
+
+            changed_index = randint(0, args.seq_length - 1)
+            input_ids_changed = input_ids.clone()
+            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+
+            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+
+            # All tokens in the past should be unchanged
+            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+
+    def test_gpt_rotary_embeddings(self):
+        """Test rotary embeddings"""
+        command_args = get_default_args()
+
+        del command_args["--max-position-embeddings"]
+        command_args["--position-embedding-type"] = "rotary"
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+
+            model = GPTModelPipe(
+                num_tokentypes=0,
+                parallel_output=True,
+                pre_process=True,
+                post_process=True
+            )
+
+            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+            position_ids = torch.arange(args.seq_length).unsqueeze(0)
+            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+
+            output = model(input_ids, position_ids, attention_mask)[0]
+
+            changed_index = randint(0, args.seq_length - 1)
+            input_ids_changed = input_ids.clone()
+            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+
+            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+
+            # All tokens in the past should be unchanged
+            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])

 if __name__ == '__main__':
     unittest.main()
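The invariance that test_gpt_causal is after can be seen in isolation, without Megatron or DeepSpeed. Below is a minimal sketch in plain PyTorch (toy_causal_attention and all shapes are illustrative names, not part of the patched code): under a lower-triangular attention mask, perturbing the token at one position leaves every earlier position's output bit-for-bit unchanged and alters the perturbed position and everything after it.

    import torch

    def toy_causal_attention(x):
        """Single-head self-attention with a causal (lower-triangular) mask."""
        scores = x @ x.transpose(-1, -2) / x.shape[-1] ** 0.5
        causal_mask = torch.tril(torch.ones(x.shape[1], x.shape[1], dtype=torch.bool))
        scores = scores.masked_fill(~causal_mask, float("-inf"))
        return torch.softmax(scores, dim=-1) @ x

    torch.manual_seed(0)
    x = torch.randn(1, 8, 16)           # (batch, seq, hidden)
    changed_index = 3
    x_changed = x.clone()
    x_changed[:, changed_index] += 1.0  # perturb a single position

    out, out_changed = toy_causal_attention(x), toy_causal_attention(x_changed)
    # Positions before the change are computed from identical inputs...
    assert torch.equal(out[:, :changed_index], out_changed[:, :changed_index])
    # ...while the changed position and everything after it differ.
    assert not torch.equal(out[:, changed_index:], out_changed[:, changed_index:])

This is exactly the split the test asserts on the model output, once the comparison machinery is working.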
From b78dfaad3201460ebd79cd3bbd9a192169e90746 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 14:42:11 +0200
Subject: [PATCH 03/30] WIP

---
 megatron/model/test/test_gpt_model.py | 96 ++++++++++++---------------
 pretrain_prefix_lm.py                 |  2 +-
 2 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 9e3811d74..ababd81f6 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -3,11 +3,12 @@ from unittest.mock import patch

 import torch
+from deepspeed import deepspeed

 from megatron import initialize_megatron, get_args, get_tokenizer
 from megatron.model import GPTModelPipe
-from pretrain_gpt import get_batch_pipe
-
+from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
+from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe

 def get_default_args():
     """Return a dictionary mapping each argument name to its value."""
@@ -75,10 +76,7 @@ def test_gpt_causal(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model = GPTModelPipe(
-                num_tokentypes=0,
-                parallel_output=True,
-            )
+            model_engine = deepspeed.init_inference(gpt_model_provider())

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -86,28 +84,28 @@ def test_gpt_causal(self):
             token_ids[token_ids == tokenizer.eod] += 1
             token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            # we set a variation on the inputs
-            changed_index = randint(0, args.seq_length - 1)
-            token_ids_changed = token_ids.clone()
-            token_ids_changed[changed_index] = (token_ids_changed[changed_index] + 1) % args.padded_vocab_size
-
-            model.forward()[get_batch_pipe(token_ids)]
-
-            position_ids = torch.arange(args.seq_length).unsqueeze(0)
-            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
-
-            output = model(input_ids, position_ids, attention_mask)[0]
-
+            # process batch
+            input_batch = get_gpt_batch_pipe(token_ids)[0]

+            # get a modified version of the first batch
+            changed_index = randint(0, args.seq_length - 2)
+            input_token_ids_changed = input_batch[0].clone()
+            # We randomly increment the index by one of that index
+            input_token_ids_changed[changed_index] = (input_token_ids_changed[changed_index] + 1) % args.padded_vocab_size

-            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+            output = model_engine(input_batch)
+            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])

     def test_gpt_prefix(self):
-        """Test prefix invariance, i.e. past tokens in the target don't depend on future tokens."""
+        """
+        Test prefix invariances:
+         - Past tokens in the target don't depend on future tokens.
+         - Input tokens
+        """
         command_args = get_default_args()

         command_args["--prefix-lm"] = "",
@@ -116,25 +114,28 @@ def test_gpt_prefix(self):
         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
             args = get_args()
+            tokenizer = get_tokenizer()
+
+            model_engine = deepspeed.init_inference(prefix_lm_model_provider())

-            model = GPTModelPipe(
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=True,
-                post_process=True
-            )
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

-            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
-            position_ids = torch.arange(args.seq_length).unsqueeze(0)
-            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
+            # eod is a special token
+            token_ids[token_ids == tokenizer.eod] += 1
+            token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            output = model(input_ids, position_ids, attention_mask)[0]
+            # process batch
+            input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)

-            changed_index = randint(0, args.seq_length - 1)
-            input_ids_changed = input_ids.clone()
-            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+            # get a modified version of the first batch
+            changed_index = randint(0, args.seq_length - 2)
+            input_token_ids_changed = input_batch[0].clone()
+            # We randomly increment the index by one of that index
+            input_token_ids_changed[changed_index] = (input_token_ids_changed[
+                changed_index] + 1) % args.padded_vocab_size

-            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+            output = model_engine(input_batch)
+            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
@@ -149,29 +150,20 @@ def test_gpt_rotary_embeddings(self):
         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
             args = get_args()
+            tokenizer = get_tokenizer()

-            model = GPTModelPipe(
-                num_tokentypes=0,
-                parallel_output=True,
-                pre_process=True,
-                post_process=True
-            )
-
-            input_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
-            position_ids = torch.arange(args.seq_length).unsqueeze(0)
-            attention_mask = torch.ones((args.micro_batch_size, 1, args.seq_length, args.seq_length))
-
-            output = model(input_ids, position_ids, attention_mask)[0]
+            model_engine = deepspeed.init_inference(gpt_model_provider())

-            changed_index = randint(0, args.seq_length - 1)
-            input_ids_changed = input_ids.clone()
-            input_ids_changed[changed_index] = (input_ids_changed[changed_index] + 1) % args.padded_vocab_size
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

-            output_changed = model(input_ids_changed, position_ids, attention_mask)[0]
+            # eod is a special token
+            token_ids[token_ids == tokenizer.eod] += 1
+            token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            # All tokens in the past should be unchanged
-            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+            # process batch
+            input_batch = get_gpt_batch_pipe(token_ids)[0]

+            model_engine(input_batch)

 if __name__ == '__main__':
     unittest.main()
diff --git a/pretrain_prefix_lm.py b/pretrain_prefix_lm.py
index 0f4928021..5b168e8b9 100644
--- a/pretrain_prefix_lm.py
+++ b/pretrain_prefix_lm.py
@@ -145,7 +145,7 @@ def get_batch_pipe(data):
         loss_on_targets_only=args.loss_on_targets_only
     )

-    return (tokens, position_ids, attention_mask), (labels, loss_mask)
+    return (tokens, position_ids, attention_mask), (labels, loss_mask), prefix_indices

 def loss_func(loss_mask, output_tensor):
     losses = output_tensor.float()

From 0bfc2f4fb09ad95513d68c170ed630aac9c98c7e Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 15:23:16 +0200
Subject: [PATCH 04/30] Test test

---
 megatron/model/test/test_gpt_model.py | 99 ++++++++++++++++++++++-----
 1 file changed, 82 insertions(+), 17 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index ababd81f6..f5657c7f1 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -87,24 +87,35 @@ def test_gpt_causal(self):
             # process batch
             input_batch = get_gpt_batch_pipe(token_ids)[0]

-            # get a modified version of the first batch
+            # get a modified version of the first batch, we change a specific index
             changed_index = randint(0, args.seq_length - 2)
             input_token_ids_changed = input_batch[0].clone()
-            # We randomly increment the index by one of that index
-            input_token_ids_changed[changed_index] = (input_token_ids_changed[changed_index] + 1) % args.padded_vocab_size
+            # We increment the token_id by one for that index in order to artificially change the sequence.
+            input_token_ids_changed[changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

             output = model_engine(input_batch)
             output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
-            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+            self.assertTrue(
+                torch.all(
+                    output[:, :changed_index].eq(output_changed[:, :changed_index])
+                )
+            )
+            # All tokens in the future should have changed
+            self.assertFalse(
+                torch.any(
+                    output[:, changed_index:].eq(output_changed[:, changed_index:])
+                )
+            )

     def test_gpt_prefix(self):
         """
         Test prefix invariances:
-         - Past tokens in the target don't depend on future tokens.
-         - Input tokens
+         - Past target tokens don't depend on future target tokens.
+         - Target tokens depend on input tokens.
+         - Input tokens depend on all other input tokens, but never target tokens.
         """
         command_args = get_default_args()

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

-            # eod is a special token
+            # eod is a special token, this also guarantees that the whole row is considered as a document.
             token_ids[token_ids == tokenizer.eod] += 1
             token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

-            # process batch
-            input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+            # process batch to have non-empty prefix
+            for i in range(9, -1, -1):
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+                if (prefix_indices[0][0] != 0):
+                    break
+                if i == 0:
+                    # FIXME: find a better way to not obtain empty prefix
+                    raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
+
+            output = model_engine(input_batch)

+            ## --------------- CHANGE A TARGET TOKEN ---------------------------
             # get a modified version of the first batch
-            changed_index = randint(0, args.seq_length - 2)
-            input_token_ids_changed = input_batch[0].clone()
-            # We randomly increment the index by one of that index
-            input_token_ids_changed[changed_index] = (input_token_ids_changed[
-                changed_index] + 1) % args.padded_vocab_size
+            changed_target_index = prefix_indices[0][0] # guaranteed to exist as each row has at least one partial document
+            token_ids_changed_target = input_batch[0].clone()
+            # We increment the token id on the changed index.
+            token_ids_changed_target[changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
+            # make sure we're not changing a token to eod as it's a special token
+            token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
+            token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size

-            output = model_engine(input_batch)
-            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))
+            # Test change
+            output_changed_target = model_engine((token_ids_changed_target, *input_batch[1:]))

             # All tokens in the past should be unchanged
-            self.assertEqual(output[:, :changed_index], output_changed[:, :changed_index])
+            self.assertTrue(
+                torch.all(
+                    output[0, :changed_target_index].eq(output_changed_target[0, :changed_target_index])
+                )
+            )
+            # All tokens in the future should have changed
+            self.assertFalse(
+                torch.any(
+                    output[0, changed_target_index:].eq(output_changed_target[0, changed_target_index:])
+                )
+            )
+            # Unmodified rows should not change either
+            self.assertTrue(
+                torch.all(
+                    output[1, :].eq(output_changed_target[1, :])
+                )
+            )
+
+            ## --------------- CHANGE AN INPUT TOKEN ---------------------------
+            # Let's change the last prefix token and make sure that the first token changed
+            last_prefix_index = prefix_indices[0][0] - 1 # guaranteed to be positive as we avoid pathological case previously
+            token_ids_changed_input = input_batch[0].clone()
+            # We increment the token id on the changed index.
+            token_ids_changed_input[changed_target_index] = (token_ids_changed_input[
+                0, last_prefix_index] + 1) % args.padded_vocab_size
+            # make sure we're not changing a token to eod as it's a special token
+            token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
+            token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
+
+            output_changed_input = model_engine((token_ids_changed_input, *input_batch[1:]))
+
+            # All tokens should be changed
+            self.assertFalse(
+                torch.any(
+                    output[0, :].eq(output_changed_input[0, :])
+                )
+            )
+            # Unmodified rows should not change either
+            self.assertTrue(
+                torch.all(
+                    output[1, :].eq(output_changed_input[1, :])
+                )
+            )

     def test_gpt_rotary_embeddings(self):
         """Test rotary embeddings"""

From b8016371a395c23380dc9975a51a55e9d4c992da Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Fri, 6 Aug 2021 19:16:04 +0200
Subject: [PATCH 05/30] Test how to set up deepspeed in unit tests

---
 megatron/model/test/test_gpt_model.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index f5657c7f1..401eb958d 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -19,9 +19,6 @@ def get_default_args():
     DATA_PATH=""

     return {
-        # Deepspeed
-        "--deepspeed": "",
-
         # GPT_ARGS
         "--num-layers": "2",
         "--hidden-size": "128",
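The three prefix-LM invariances listed in the docstring above follow from the shape of the attention mask: prefix (input) tokens attend bidirectionally among themselves, while target tokens remain causal. Here is a self-contained sketch of that mask and of the two perturbation checks the test performs, in plain PyTorch with made-up names (the real mask construction lives in Megatron's utilities):

    import torch

    def toy_prefix_attention(x, prefix_len):
        """Self-attention with a prefix-LM mask: positions < prefix_len attend
        bidirectionally within the prefix; later positions attend causally."""
        seq = x.shape[1]
        mask = torch.tril(torch.ones(seq, seq, dtype=torch.bool))
        mask[:prefix_len, :prefix_len] = True  # full attention inside the prefix
        scores = (x @ x.transpose(-1, -2) / x.shape[-1] ** 0.5).masked_fill(~mask, float("-inf"))
        return torch.softmax(scores, dim=-1) @ x

    torch.manual_seed(0)
    x = torch.randn(1, 8, 16)
    prefix_len = 4

    # Changing the last prefix token affects every position, because every
    # row of the mask can see the prefix.
    x_changed = x.clone()
    x_changed[:, prefix_len - 1] += 1.0
    out = toy_prefix_attention(x, prefix_len)
    out_changed = toy_prefix_attention(x_changed, prefix_len)
    assert not torch.equal(out, out_changed)

    # Changing a target token only affects that position and later ones.
    x_changed = x.clone()
    x_changed[:, prefix_len + 1] += 1.0
    out_changed = toy_prefix_attention(x_changed, prefix_len)
    assert torch.equal(out[:, :prefix_len + 1], out_changed[:, :prefix_len + 1])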
- Example: {"arg1": "value1", "arg2": "value2"} -> ["arg1", "value1", "arg2", "value2"] + Note: we add "IGNORED" at the beginning as this value is ignored by the argparser + + Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"] """ - return [item for key_value in args.items() for item in key_value] + return ["IGNORED"] + [item for key_value in args.items() for item in key_value] class MyTestCase(unittest.TestCase): def test_gpt_causal(self): From aeca8c146d033e7a1fa8b92da0bd603546c8c7c3 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 22:10:06 +0200 Subject: [PATCH 07/30] Empty strings might be problematic --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index f6a3f5710..4c7e3c416 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -62,7 +62,7 @@ def flatten_arguments(args): Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"] """ - return ["IGNORED"] + [item for key_value in args.items() for item in key_value] + return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): def test_gpt_causal(self): From 520ef720ca8accbc950375720becc11152330b09 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 22:25:34 +0200 Subject: [PATCH 08/30] Remove unecessary arguments --- megatron/model/test/test_gpt_model.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 4c7e3c416..9417d2fee 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -11,12 +11,6 @@ def get_default_args(): """return a dictionary with key as argument name and value as additional arguments""" - VOCAB_FILE="" - MERGE_FILE="" - - CHECKPOINT_PATH="" - DATA_PATH="" - return { # GPT_ARGS "--num-layers": "2", @@ -31,8 +25,8 @@ def get_default_args(): "--lr": "0.00015", "--min-lr": "1.0e-5", "--train-iters": "5000", - "--vocab-file": VOCAB_FILE, - "--merge-file": MERGE_FILE, + "--tokenizer": "PretrainedFromHF", + "--tokenizer-name-or-path": "gpt2", "--data-impl": "mmap", "--split": "949,50,1", "--distributed-backend": "nccl", @@ -49,9 +43,6 @@ def get_default_args(): "--checkpoint-activations": "", # DATA_ARGS - "--save": CHECKPOINT_PATH, - "--load": CHECKPOINT_PATH, - "--data-path": DATA_PATH, } def flatten_arguments(args): From 37522b41c0edfce359f1890ec70764d13be0257e Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 22:55:35 +0200 Subject: [PATCH 09/30] Woops --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 9417d2fee..9c2b0e60d 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -25,7 +25,7 @@ def get_default_args(): "--lr": "0.00015", "--min-lr": "1.0e-5", "--train-iters": "5000", - "--tokenizer": "PretrainedFromHF", + "--tokenizer-type": "PretrainedFromHF", "--tokenizer-name-or-path": "gpt2", "--data-impl": "mmap", "--split": "949,50,1", From 76f01fecd5dfaa475c39b088ad5944ee905298bb Mon 
Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 23:16:38 +0200 Subject: [PATCH 10/30] Remove global variables at the end of each test and init deepspeed --- megatron/model/test/test_gpt_model.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 9c2b0e60d..16573a827 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -56,6 +56,25 @@ def flatten_arguments(args): return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): + def setUpClass(cls) -> None: + deepspeed.init_distributed() + + def tearDown(self) -> None: + # We reset all global variables + global _GLOBAL_ARGS + global _GLOBAL_NUM_MICROBATCHES_CALCULATOR + global _GLOBAL_TOKENIZER + global _GLOBAL_TENSORBOARD_WRITER + global _GLOBAL_ADLR_AUTORESUME + global _GLOBAL_TIMERS + + _GLOBAL_ARGS = None + _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None + _GLOBAL_TOKENIZER = None + _GLOBAL_TENSORBOARD_WRITER = None + _GLOBAL_ADLR_AUTORESUME = None + _GLOBAL_TIMERS = None + def test_gpt_causal(self): """Test causal invariance, ie past token don't depend on future tokens.""" command_args = get_default_args() From 188b33b60e0f05c59e650a427d7842949c1311b0 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 23:17:59 +0200 Subject: [PATCH 11/30] Woops --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 16573a827..0db89f02c 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -56,7 +56,7 @@ def flatten_arguments(args): return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): - def setUpClass(cls) -> None: + def setUpClass(self) -> None: deepspeed.init_distributed() def tearDown(self) -> None: From 57191c4038a17b4ef93d39c05415df4417acbd09 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Fri, 6 Aug 2021 23:20:00 +0200 Subject: [PATCH 12/30] Maybe adding classmethod --- megatron/model/test/test_gpt_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 0db89f02c..6cdf14185 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -56,7 +56,8 @@ def flatten_arguments(args): return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] class MyTestCase(unittest.TestCase): - def setUpClass(self) -> None: + @classmethod + def setUpClass(cls) -> None: deepspeed.init_distributed() def tearDown(self) -> None: From 1389e6dc30841838552ebae6299b8e1bacf49543 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 03:16:08 +0200 Subject: [PATCH 13/30] Woops --- megatron/model/test/test_gpt_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 6cdf14185..465305961 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -2,8 +2,8 @@ from random import randint from 
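A note on the teardown strategy introduced in patches 10-12: a `global _GLOBAL_ARGS` statement executed in the test module only rebinds that name inside the test module itself; it cannot reach the module-level variables defined in megatron.global_vars, which is presumably why a later patch in this series switches to assigning through the module object instead. A minimal sketch of the difference (somelib is an illustrative stand-in, built as an in-memory module so the snippet is self-contained):

    import types

    # Stand-in for a library module with global state (e.g. megatron.global_vars).
    somelib = types.ModuleType("somelib")
    somelib._STATE = "initialized"

    def broken_reset():
        # `global` rebinds the name in the module where this function is
        # defined; it never touches somelib's own globals.
        global _STATE
        _STATE = None

    def working_reset():
        # Assigning through the module object mutates somelib's global.
        somelib._STATE = None

    broken_reset()
    assert somelib._STATE == "initialized"  # unaffected
    working_reset()
    assert somelib._STATE is None           # actually reset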
From 1389e6dc30841838552ebae6299b8e1bacf49543 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 03:16:08 +0200
Subject: [PATCH 13/30] Woops

---
 megatron/model/test/test_gpt_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 6cdf14185..465305961 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -2,8 +2,8 @@
 from random import randint
 from unittest.mock import patch

+import deepspeed
 import torch
-from deepspeed import deepspeed

 from megatron import initialize_megatron, get_args, get_tokenizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe

From e45854085c3645a6f96ff9088e8a45f1d54ac5e8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 03:19:04 +0200
Subject: [PATCH 14/30] Add debug print to check that tear down happens

---
 megatron/model/test/test_gpt_model.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 465305961..43b736d6f 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -62,6 +62,7 @@ def setUpClass(cls) -> None:

     def tearDown(self) -> None:
         # We reset all global variables
+        print("Tearing down args.")
         global _GLOBAL_ARGS
         global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
         global _GLOBAL_TOKENIZER

From d7f331f06d502a68679a40b594116eb962f4d542 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 03:25:27 +0200
Subject: [PATCH 15/30] Reset global variables before

---
 megatron/model/test/test_gpt_model.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 43b736d6f..0820d9075 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -60,9 +60,8 @@ class MyTestCase(unittest.TestCase):
     def setUpClass(cls) -> None:
         deepspeed.init_distributed()

-    def tearDown(self) -> None:
+    def setUp(self) -> None:
         # We reset all global variables
-        print("Tearing down args.")
         global _GLOBAL_ARGS
         global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
         global _GLOBAL_TOKENIZER

From af9a7164411a81c05296e6060cbbfb31e7902453 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:10:20 +0200
Subject: [PATCH 16/30] Let's test this

---
 megatron/model/test/test_gpt_model.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 0820d9075..2d88c658f 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -1,3 +1,4 @@
+import argparse
 import unittest
 from random import randint
 from unittest.mock import patch
@@ -6,6 +7,7 @@
 import torch

 from megatron import initialize_megatron, get_args, get_tokenizer
+from megatron.training import setup_model_and_optimizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
 from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe

@@ -85,7 +87,7 @@ def test_gpt_causal(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model_engine = deepspeed.init_inference(gpt_model_provider())
+            model, _, _ = setup_model_and_optimizer(gpt_model_provider)

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -239,5 +241,10 @@ def test_gpt_rotary_embeddings(self):

             model_engine(input_batch)

+def get_deepspeed_args():
+    parser = argparse.ArgumentParser()
+    return deepspeed.add_config_arguments(parser)
+
 if __name__ == '__main__':
+    get_deepspeed_args()
     unittest.main()

From b90812497104036f7936edc96c634ed7f148cd3c Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:21:15 +0200
Subject: [PATCH 17/30] Try something else

---
 megatron/model/test/test_gpt_model.py | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 2d88c658f..956d32b3b 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -6,7 +6,7 @@
 import deepspeed
 import torch

-from megatron import initialize_megatron, get_args, get_tokenizer
+from megatron import initialize_megatron, get_args, get_tokenizer, global_vars
 from megatron.training import setup_model_and_optimizer
 from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe
 from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe
@@ -64,19 +64,12 @@ def setUpClass(cls) -> None:

     def setUp(self) -> None:
         # We reset all global variables
-        global _GLOBAL_ARGS
-        global _GLOBAL_NUM_MICROBATCHES_CALCULATOR
-        global _GLOBAL_TOKENIZER
-        global _GLOBAL_TENSORBOARD_WRITER
-        global _GLOBAL_ADLR_AUTORESUME
-        global _GLOBAL_TIMERS
-
-        _GLOBAL_ARGS = None
-        _GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
-        _GLOBAL_TOKENIZER = None
-        _GLOBAL_TENSORBOARD_WRITER = None
-        _GLOBAL_ADLR_AUTORESUME = None
-        _GLOBAL_TIMERS = None
+        global_vars._GLOBAL_ARGS = None
+        global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
+        global_vars._GLOBAL_TOKENIZER = None
+        global_vars._GLOBAL_TENSORBOARD_WRITER = None
+        global_vars._GLOBAL_ADLR_AUTORESUME = None
+        global_vars._GLOBAL_TIMERS = None

     def test_gpt_causal(self):
         """Test causal invariance, i.e. past tokens don't depend on future tokens."""
@@ -89,7 +89,7 @@ def test_gpt_causal(self):
             token_ids[token_ids == tokenizer.eod] %= args.padded_vocab_size

             # process batch
-            input_batch = get_gpt_batch_pipe(token_ids)[0]
+            input_batch = get_gpt_batch_pipe({"text": token_ids})[0]

             # get a modified version of the first batch, we change a specific index
             changed_index = randint(0, args.seq_length - 2)
             input_token_ids_changed = input_batch[0].clone()
             # We increment the token_id by one for that index in order to artificially change the sequence.
             input_token_ids_changed[changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

-            output = model_engine(input_batch)
-            output_changed = model_engine((input_token_ids_changed, *input_batch[1:]))
+            output = model(input_batch)
+            output_changed = model((input_token_ids_changed, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertTrue(
From 28cea9564037f1eb6a96eb28967d9f77bd8feb09 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:26:07 +0200
Subject: [PATCH 18/30] WIP

---
 megatron/model/test/test_gpt_model.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 956d32b3b..c5760db44 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -95,7 +95,7 @@ def test_gpt_causal(self):
             changed_index = randint(0, args.seq_length - 2)
             input_token_ids_changed = input_batch[0].clone()
             # We increment the token_id by one for that index in order to artificially change the sequence.
-            input_token_ids_changed[changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size
+            input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

             output = model(input_batch)
             output_changed = model((input_token_ids_changed, *input_batch[1:]))
@@ -131,7 +131,7 @@ def test_gpt_prefix(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model_engine = deepspeed.init_inference(prefix_lm_model_provider())
+            model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider)

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -148,7 +148,7 @@ def test_gpt_prefix(self):
             # process batch to have non-empty prefix
             for i in range(9, -1, -1):
-                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
                 if (prefix_indices[0][0] != 0):
                     break
                 if i == 0:
                     # FIXME: find a better way to not obtain empty prefix
                     raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")

-            output = model_engine(input_batch)
+            output = model(input_batch)

             ## --------------- CHANGE A TARGET TOKEN ---------------------------
             # get a modified version of the first batch
@@ -161,7 +161,7 @@ def test_gpt_prefix(self):
             # Test change
-            output_changed_target = model_engine((token_ids_changed_target, *input_batch[1:]))
+            output_changed_target = model((token_ids_changed_target, *input_batch[1:]))

             # All tokens in the past should be unchanged
             self.assertTrue(
@@ -193,7 +193,7 @@ def test_gpt_prefix(self):
-            output_changed_input = model_engine((token_ids_changed_input, *input_batch[1:]))
+            output_changed_input = model((token_ids_changed_input, *input_batch[1:]))

             # All tokens should be changed
             self.assertFalse(
@@ -221,7 +221,7 @@ def test_gpt_rotary_embeddings(self):
             args = get_args()
             tokenizer = get_tokenizer()

-            model_engine = deepspeed.init_inference(gpt_model_provider())
+            model, _, _ = setup_model_and_optimizer(gpt_model_provider)

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -232,7 +232,7 @@ def test_gpt_rotary_embeddings(self):
             # process batch
             input_batch = get_gpt_batch_pipe(token_ids)[0]

-            model_engine(input_batch)
+            model(input_batch)

From 642ef91de06fa964f323a69dbe949bcb9f295f14 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:35:42 +0200
Subject: [PATCH 19/30] More fixes

---
 megatron/model/test/test_gpt_model.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index c5760db44..c9203fdf2 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -37,6 +37,9 @@ def get_default_args():
         "--lr-warmup-fraction": ".01",
         "--fp16": "",

+        "--attention-dropout": "0",
+        "--hidden-dropout": "0",
+
         # OUTPUT_ARGS
         "--log-interval": "10",
         "--save-interval": "500",
@@ -81,6 +84,7 @@ def test_gpt_causal(self):
             tokenizer = get_tokenizer()

             model, _, _ = setup_model_and_optimizer(gpt_model_provider)
+            model = model[0]

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -123,8 +127,8 @@ def test_gpt_prefix(self):
         """
         command_args = get_default_args()

-        command_args["--prefix-lm"] = "",
-        command_args["--reset-attention-mask"] = "",
+        command_args["--prefix-lm"] = ""
+        command_args["--reset-attention-mask"] = ""

         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
             args = get_args()
             tokenizer = get_tokenizer()

             model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider)
+            model = model[0]

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -146,7 +151,7 @@ def test_gpt_prefix(self):
             # process batch to have non-empty prefix
             for i in range(9, -1, -1):
-                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe(token_ids)
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
                 if (prefix_indices[0][0] != 0):
                     break
                 if i == 0:
                     # FIXME: find a better way to not obtain empty prefix
                     raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
@@ -222,6 +227,7 @@ def test_gpt_rotary_embeddings(self):
             tokenizer = get_tokenizer()

             model, _, _ = setup_model_and_optimizer(gpt_model_provider)
+            model = model[0]

             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))

@@ -232,7 +238,7 @@ def test_gpt_rotary_embeddings(self):
             # process batch
-            input_batch = get_gpt_batch_pipe(token_ids)[0]
+            input_batch = get_gpt_batch_pipe({"text": token_ids})[0]

             model(input_batch)

From 5143ce68effb53c354f9a7e0cdf0247bb5dba9a0 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:37:44 +0200
Subject: [PATCH 20/30] More fixes

---
 megatron/model/test/test_gpt_model.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index c9203fdf2..09f985b9b 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -101,8 +101,8 @@ def test_gpt_causal(self):
             # We increment the token_id by one for that index in order to artificially change the sequence.
             input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size

-            output = model(input_batch)
-            output_changed = model((input_token_ids_changed, *input_batch[1:]))
+            output = model(*input_batch)
+            output_changed = model(input_token_ids_changed, *input_batch[1:])

             # All tokens in the past should be unchanged
             self.assertTrue(
@@ -153,7 +153,7 @@ def test_gpt_prefix(self):
                     # FIXME: find a better way to not obtain empty prefix
                     raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")

-            output = model(input_batch)
+            output = model(*input_batch)

             ## --------------- CHANGE A TARGET TOKEN ---------------------------
             # get a modified version of the first batch
@@ -166,7 +166,7 @@ def test_gpt_prefix(self):
             # Test change
-            output_changed_target = model((token_ids_changed_target, *input_batch[1:]))
+            output_changed_target = model(token_ids_changed_target, *input_batch[1:])

             # All tokens in the past should be unchanged
             self.assertTrue(
@@ -198,7 +198,7 @@ def test_gpt_prefix(self):
-            output_changed_input = model((token_ids_changed_input, *input_batch[1:]))
+            output_changed_input = model(token_ids_changed_input, *input_batch[1:])

             # All tokens should be changed
             self.assertFalse(
@@ -238,7 +238,7 @@ def test_gpt_rotary_embeddings(self):
             # process batch
             input_batch = get_gpt_batch_pipe({"text": token_ids})[0]

-            model(input_batch)
+            model(*input_batch)

From 8cfb92ce95dad0b32c9368ea7132d3853cec565d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 17:41:53 +0200
Subject: [PATCH 21/30] More stuff to fix

---
 megatron/model/test/test_gpt_model.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index 09f985b9b..f8e80f87a 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -111,6 +111,10 @@ def test_gpt_causal(self):
                 )
             )
             # All tokens in the future should have changed
+            print(torch.any(
+                output[:, changed_index:].eq(output_changed[:, changed_index:])
+                )
+            )
             self.assertFalse(
                 torch.any(
                     output[:, changed_index:].eq(output_changed[:, changed_index:])
                 )
             )
@@ -160,7 +164,7 @@ def test_gpt_prefix(self):
             changed_target_index = prefix_indices[0][0] # guaranteed to exist as each row has at least one partial document
             token_ids_changed_target = input_batch[0].clone()
             # We increment the token id on the changed index.
-            token_ids_changed_target[changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
+            token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size
             # make sure we're not changing a token to eod as it's a special token
             token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1
             token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size
@@ -192,7 +196,7 @@ def test_gpt_prefix(self):
             last_prefix_index = prefix_indices[0][0] - 1 # guaranteed to be positive as we avoid pathological case previously
             token_ids_changed_input = input_batch[0].clone()
             # We increment the token id on the changed index.
-            token_ids_changed_input[changed_target_index] = (token_ids_changed_input[
+            token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[
                 0, last_prefix_index] + 1) % args.padded_vocab_size
             # make sure we're not changing a token to eod as it's a special token
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size
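Several of the fixes in patches 18 through 21 correct the same two-dimensional indexing slip: on a (micro_batch_size, seq_length) tensor, `t[i]` selects sample i's entire sequence along the first dimension, not position i across the batch. A three-line illustration of the distinction:

    import torch

    token_ids = torch.arange(12).reshape(3, 4)  # (batch=3, seq_length=4)

    # tensor[i] indexes the *first* dimension: this is sample 1's whole
    # sequence, not position 1 in every sample.
    row = token_ids[1]       # tensor([4, 5, 6, 7])

    # To change one position, index both dimensions explicitly:
    token_ids[:, 2] += 1     # position 2 in every sample
    token_ids[0, 2] += 1     # position 2 in sample 0 only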
From 9dbd9399590cf74cbe9809e4e28aab5d8385373d Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Sat, 7 Aug 2021 18:03:29 +0200
Subject: [PATCH 22/30] We really want to compare vectors and not coordinates

---
 megatron/model/test/test_gpt_model.py | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index f8e80f87a..e5f8040db 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -60,6 +60,10 @@ def flatten_arguments(args):
     """
     return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]

+def equal_vectors(tensor1, tensor2, dim = -1):
+    """View tensor1 and tensor2 as a list of vectors, and compute equality"""
+    return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0
+
 class MyTestCase(unittest.TestCase):
     @classmethod
     def setUpClass(cls) -> None:
@@ -106,19 +110,11 @@ def test_gpt_causal(self):

             # All tokens in the past should be unchanged
             self.assertTrue(
-                torch.all(
-                    output[:, :changed_index].eq(output_changed[:, :changed_index])
-                )
+                torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index]))
             )
             # All tokens in the future should have changed
-            print(torch.any(
-                output[:, changed_index:].eq(output_changed[:, changed_index:])
-                )
-            )
             self.assertFalse(
-                torch.any(
-                    output[:, changed_index:].eq(output_changed[:, changed_index:])
-                )
+                torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
             )

             # All tokens in the past should be unchanged
             self.assertTrue(
                 torch.all(
-                    output[0, :changed_target_index].eq(output_changed_target[0, :changed_target_index])
+                    equal_vectors(output[0, :changed_target_index], output_changed_target[0, :changed_target_index])
                 )
             )
             # All tokens in the future should have changed
             self.assertFalse(
                 torch.any(
-                    output[0, changed_target_index:].eq(output_changed_target[0, changed_target_index:])
+                    equal_vectors(output[0, changed_target_index:], output_changed_target[0, changed_target_index:])
                 )
             )
             # Unmodified rows should not change either
             self.assertTrue(
                 torch.all(
-                    output[1, :].eq(output_changed_target[1, :])
+                    equal_vectors(output[1, :], output_changed_target[1, :])
                 )
             )

             # All tokens should be changed
             self.assertFalse(
                 torch.any(
-                    output[0, :].eq(output_changed_input[0, :])
+                    equal_vectors(output[0, :], output_changed_input[0, :])
                 )
             )
             # Unmodified rows should not change either
             self.assertTrue(
                 torch.all(
-                    output[1, :].eq(output_changed_input[1, :])
+                    equal_vectors(output[1, :], output_changed_input[1, :])
                 )
             )
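As the commit subject says, `equal_vectors` compares hidden states as whole vectors: `torch.linalg.norm(tensor1 - tensor2, dim=-1)` collapses the hidden dimension, so the result is one boolean per (batch, position) pair. By contrast, `.eq()` compares coordinate-wise, and `torch.any` over it can be tripped by a single coincidentally equal coordinate inside an otherwise changed vector. A small usage sketch of the helper exactly as defined in the patch:

    import torch

    def equal_vectors(tensor1, tensor2, dim=-1):
        """View tensor1 and tensor2 as a list of vectors, and compute equality"""
        return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0

    a = torch.zeros(2, 3, 4)          # (batch, seq, hidden)
    b = a.clone()
    b[0, 1, 0] = 1.0                  # perturb one coordinate of one vector

    print(equal_vectors(a, b).shape)  # torch.Size([2, 3]): one bool per vector
    print(equal_vectors(a, b)[0, 1])  # tensor(False): the perturbed vector
    print((a == b)[0, 1])             # coordinate-wise: 3 of 4 are still True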
changed self.assertFalse( torch.any( - output[0, :].eq(output_changed_input[0, :]) + equal_vectors(output[0, :], output_changed_input[0, :]) ) ) # Unchanged changed rows should not change either self.assertTrue( torch.all( - output[1, :].eq(output_changed_input[1, :]) + equal_vectors(output[1, :], output_changed_input[1, :]) ) ) From 82c6ca1d05e7cdb842ff0daa7d4e97c71f02fa41 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 18:05:55 +0200 Subject: [PATCH 23/30] Reformat --- megatron/model/test/test_gpt_model.py | 28 ++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index e5f8040db..2fe66d078 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -1,4 +1,3 @@ -import argparse import unittest from random import randint from unittest.mock import patch @@ -11,6 +10,7 @@ from pretrain_gpt import model_provider as gpt_model_provider, get_batch_pipe as get_gpt_batch_pipe from pretrain_prefix_lm import model_provider as prefix_lm_model_provider, get_batch_pipe as get_prefix_lm_batch_pipe + def get_default_args(): """return a dictionary with key as argument name and value as additional arguments""" return { @@ -50,6 +50,7 @@ def get_default_args(): # DATA_ARGS } + def flatten_arguments(args): """ Converts dictionary argument to a list. @@ -60,10 +61,12 @@ def flatten_arguments(args): """ return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""] -def equal_vectors(tensor1, tensor2, dim = -1): + +def equal_vectors(tensor1, tensor2, dim=-1): """View tensor1 and tensor2 as a list of vectors, and compute equality""" return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0 + class MyTestCase(unittest.TestCase): @classmethod def setUpClass(cls) -> None: @@ -103,7 +106,8 @@ def test_gpt_causal(self): changed_index = randint(0, args.seq_length - 2) input_token_ids_changed = input_batch[0].clone() # We increment the token_id by one for that index in order to artificially change the sequence. - input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, changed_index] + 1) % args.padded_vocab_size + input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, + changed_index] + 1) % args.padded_vocab_size output = model(*input_batch) output_changed = model(input_token_ids_changed, *input_batch[1:]) @@ -113,11 +117,11 @@ def test_gpt_causal(self): torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index])) ) # All tokens in the future should have changed + print(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) self.assertFalse( torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) ) - def test_gpt_prefix(self): """ Test prefix invariances: @@ -157,10 +161,12 @@ def test_gpt_prefix(self): ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch - changed_target_index = prefix_indices[0][0] # guaranteed to exist as each row has at least one partial document + changed_target_index = prefix_indices[0][ + 0] # guaranteed to exist as each row has at least one partial document token_ids_changed_target = input_batch[0].clone() # We increment the token id on the changed index. 
- token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size + token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[ + 0, changed_target_index] + 1) % args.padded_vocab_size # make sure we're not changing a token to eod as it's a special token token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1 token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size @@ -189,11 +195,12 @@ def test_gpt_prefix(self): ## --------------- CHANGE AN INPUT TOKEN --------------------------- # Let's change the the last prefix token and make sure that the first token changed - last_prefix_index = prefix_indices[0][0] - 1 # guaranteed to be positive as we avoid pathological case previously + last_prefix_index = prefix_indices[0][ + 0] - 1 # guaranteed to be positive as we avoid pathological case previously token_ids_changed_input = input_batch[0].clone() # We increment the token id on the changed index. token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[ - 0, last_prefix_index] + 1) % args.padded_vocab_size + 0, last_prefix_index] + 1) % args.padded_vocab_size # make sure we're not changing a token to eod as it's a special token token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1 token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size @@ -213,7 +220,6 @@ def test_gpt_prefix(self): ) ) - def test_gpt_rotary_embeddings(self): """Test rotary embeddings""" command_args = get_default_args() @@ -240,10 +246,6 @@ def test_gpt_rotary_embeddings(self): model(*input_batch) -def get_deepspeed_args(): - parser = argparse.ArgumentParser() - return deepspeed.add_config_arguments(parser) if __name__ == '__main__': - get_deepspeed_args() unittest.main() From 7c6ea150d147e89481a5d9bd4561901c9667d38f Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 18:21:56 +0200 Subject: [PATCH 24/30] check something out --- megatron/model/test/test_gpt_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index 2fe66d078..b6e8fe854 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -117,7 +117,6 @@ def test_gpt_causal(self): torch.all(equal_vectors(output[:, :changed_index], output_changed[:, :changed_index])) ) # All tokens in the future should have changed - print(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) self.assertFalse( torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:])) ) @@ -161,8 +160,8 @@ def test_gpt_prefix(self): ## --------------- CHANGE A TARGET TOKEN --------------------------- # get a modified version of the first batch - changed_target_index = prefix_indices[0][ - 0] # guaranteed to exist as each row has at least one partial document + # guaranteed to exist as each row has at least one partial document + changed_target_index = prefix_indices[0][0] token_ids_changed_target = input_batch[0].clone() # We increment the token id on the changed index. 
token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[ @@ -195,8 +194,8 @@ def test_gpt_prefix(self): ## --------------- CHANGE AN INPUT TOKEN --------------------------- # Let's change the the last prefix token and make sure that the first token changed - last_prefix_index = prefix_indices[0][ - 0] - 1 # guaranteed to be positive as we avoid pathological case previously + # guaranteed to be positive as we avoid pathological case previously + last_prefix_index = prefix_indices[0][0] - 1 token_ids_changed_input = input_batch[0].clone() # We increment the token id on the changed index. token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[ @@ -208,6 +207,7 @@ def test_gpt_prefix(self): output_changed_input = model(token_ids_changed_input, *input_batch[1:]) # All tokens should be changed + print(equal_vectors(output[0, :], output_changed_input[0, :])) self.assertFalse( torch.any( equal_vectors(output[0, :], output_changed_input[0, :]) From 076b69f2a1823187dea87bc2bb3d68211f716aa9 Mon Sep 17 00:00:00 2001 From: thomasw21 <24695242+thomasw21@users.noreply.github.com> Date: Sat, 7 Aug 2021 18:28:29 +0200 Subject: [PATCH 25/30] fix test --- megatron/model/test/test_gpt_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py index b6e8fe854..e74819308 100644 --- a/megatron/model/test/test_gpt_model.py +++ b/megatron/model/test/test_gpt_model.py @@ -106,8 +106,8 @@ def test_gpt_causal(self): changed_index = randint(0, args.seq_length - 2) input_token_ids_changed = input_batch[0].clone() # We increment the token_id by one for that index in order to artificially change the sequence. - input_token_ids_changed[:, changed_index] = (input_token_ids_changed[:, - changed_index] + 1) % args.padded_vocab_size + input_token_ids_changed[:, changed_index] = \ + (input_token_ids_changed[:,changed_index] + 1) % args.padded_vocab_size output = model(*input_batch) output_changed = model(input_token_ids_changed, *input_batch[1:]) @@ -164,8 +164,8 @@ def test_gpt_prefix(self): changed_target_index = prefix_indices[0][0] token_ids_changed_target = input_batch[0].clone() # We increment the token id on the changed index. - token_ids_changed_target[0, changed_target_index] = (token_ids_changed_target[ - 0, changed_target_index] + 1) % args.padded_vocab_size + token_ids_changed_target[0, changed_target_index] = \ + (token_ids_changed_target[0, changed_target_index] + 1) % args.padded_vocab_size # make sure we're not changing a token to eod as it's a special token token_ids_changed_target[token_ids_changed_target == tokenizer.eod] += 1 token_ids_changed_target[token_ids_changed_target == tokenizer.eod] %= args.padded_vocab_size @@ -198,8 +198,8 @@ def test_gpt_prefix(self): last_prefix_index = prefix_indices[0][0] - 1 token_ids_changed_input = input_batch[0].clone() # We increment the token id on the changed index. 
-            token_ids_changed_input[0, changed_target_index] = (token_ids_changed_input[
-                0, last_prefix_index] + 1) % args.padded_vocab_size
+            token_ids_changed_input[0, last_prefix_index] = \
+                (token_ids_changed_input[0, last_prefix_index] + 1) % args.padded_vocab_size
             # make sure we're not changing a token to eod as it's a special token
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] += 1
             token_ids_changed_input[token_ids_changed_input == tokenizer.eod] %= args.padded_vocab_size

From 2e0f71a33b8c39c78b7de728a6904810557541d8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Mon, 9 Aug 2021 11:18:16 +0200
Subject: [PATCH 26/30] Remove prefix-lm flag as it's integrated

---
 megatron/model/test/test_gpt_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/megatron/model/test/test_gpt_model.py b/megatron/model/test/test_gpt_model.py
index e74819308..b0a4b7cd4 100644
--- a/megatron/model/test/test_gpt_model.py
+++ b/megatron/model/test/test_gpt_model.py
@@ -130,7 +130,6 @@ def test_gpt_prefix(self):
         """
         command_args = get_default_args()
 
-        command_args["--prefix-lm"] = ""
        command_args["--reset-attention-mask"] = ""
 
         with patch('sys.argv', flatten_arguments(command_args)):

From 18b1c97cb71b9938cce3504296c3d0c49e1f8356 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 11:48:59 +0200
Subject: [PATCH 27/30] Woops

---
 tests/test_model.py         | 3 +++
 tests/test_preprocessing.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index f76556498..8dc05d887 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -74,6 +74,8 @@ def setUpClass(cls) -> None:
         deepspeed.init_distributed()
 
     def setUp(self) -> None:
+        super().setUp()
+
         # We reset all global variables
         global_vars._GLOBAL_ARGS = None
         global_vars._GLOBAL_NUM_MICROBATCHES_CALCULATOR = None
@@ -132,6 +134,7 @@ def test_prefix_lm(self):
         command_args = get_default_args()
 
         command_args["--reset-attention-mask"] = ""
+        command_args["--loss-on-targets-only"] = ""
 
         with patch('sys.argv', flatten_arguments(command_args)):
             initialize_megatron()
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 0d323234e..0d48752e6 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -101,7 +101,7 @@ def test_process_data_microsoft(self):
         data_dir = f"{self.data_dir}/gpt2"
         output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False)
 
-        input_path = f"{self.tests_dir}/tools/openwebtext-1000.jsonl"
+        input_path = f"{self.tests_dir}/data/gpt2/openwebtext-1000.jsonl"
 
         output_prefix = f"{output_dir}/test-ds-meg-gpt2-openwebtext"
 

From 76aad892bd71b40df592853984d447b3b912e044 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 12:00:26 +0200
Subject: [PATCH 28/30] Add test for without reset attention mask

---
 tests/test_model.py | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index 8dc05d887..7c6357227 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -124,9 +124,9 @@ def test_gpt(self):
             torch.any(equal_vectors(output[:, changed_index:], output_changed[:, changed_index:]))
         )
 
-    def test_prefix_lm(self):
+    def test_prefix_lm_reset_attention_mask(self):
         """
-        Test prefix invariances:
+        Test prefix invariances when `reset_attention_mask=True`:
            - Past target tokens don't depend on future target tokens.
            - Target tokens depend on input tokens.
            - Input tokens depend on all other input tokens, but never target tokens.
@@ -210,7 +210,6 @@ def test_prefix_lm(self):
             output_changed_input = model(token_ids_changed_input, *input_batch[1:])
 
             # All tokens should be changed
-            print(equal_vectors(output[0, :], output_changed_input[0, :]))
             self.assertFalse(
                 torch.any(
                     equal_vectors(output[0, :], output_changed_input[0, :])
@@ -223,6 +222,39 @@ def test_prefix_lm(self):
                 )
             )
 
+    def test_prefix_lm_wo_reset_attention_mask(self):
+        """
+        Test prefix invariances when `reset_attention_mask=False`:
+           - Past target tokens don't depend on future target tokens.
+           - Target tokens depend on input tokens.
+           - Input tokens depend on all other input tokens, but never target tokens.
+        """
+        command_args = get_default_args()
+
+        command_args["--loss-on-targets-only"] = ""
+
+        with patch('sys.argv', flatten_arguments(command_args)):
+            initialize_megatron()
+            args = get_args()
+
+            model, _, _ = setup_model_and_optimizer(prefix_lm_model_provider)
+            model = model[0]
+
+            token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
+
+            # process batch to have non-empty prefix
+            for i in range(9, -1, -1):
+                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
+                if prefix_indices[0][0] != 0:
+                    break
+                if i == 0:
+                    # FIXME: find a better way to avoid an empty prefix
+                    raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
+
+            model(*input_batch)
+
+            # TODO: Check all invariants
+
     def test_gpt_rotary_embeddings(self):
         """Test rotary embeddings"""
         command_args = get_default_args()
@@ -249,6 +281,8 @@ def test_gpt_rotary_embeddings(self):
 
             model(*input_batch)
 
+            # TODO: Check all invariants
+
 
 if __name__ == '__main__':
     unittest.main()

From 86f89283d56264aa678b071dff60370a214df40f Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 12:19:41 +0200
Subject: [PATCH 29/30] Fix test for non reset attention mask

---
 megatron/utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/megatron/utils.py b/megatron/utils.py
index 5e4f9e134..bc0446fc2 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -320,10 +320,10 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_
     assert partial_prefix_indices is None or len(partial_prefix_indices) == micro_batch_size, f"partial_prefix_indices has to be None or its length equal to {micro_batch_size}, got {len(partial_prefix_indices)}"
 
     for batch_id in range(micro_batch_size):
-        prefix_indices.append([])
-
         # Prefix lm per document.
         if reset_attention_mask:
+            prefix_indices.append([])
+
             # Compute the index of all eod tokens in data.
            eod_indices = (data[batch_id] == eod_token).nonzero().squeeze(-1)
@@ -356,6 +356,7 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_
             assert partial_prefix_indices is None or isinstance(partial_prefix_indices[batch_id], int), \
                 f"Per document prefix has to store an int for each row, got {partial_prefix_indices[batch_id]}"
 
+            prefix_index: int
             if partial_prefix_indices is None or partial_prefix_indices[batch_id] is None:
                 # We need to randomly generate a prefix index
                 prefix_index = randint(0, seq_length - 1)
@@ -363,5 +364,6 @@ def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_
                 # We get value from partial_prefix_indices, and run validation on that value
                 prefix_index = partial_prefix_indices[batch_id]
                 assert 0 <= prefix_index < seq_length - 1, f"Prefix index needs to be between documents indices, 0 <= {prefix_index} < {seq_length - 1} should be True."
-        prefix_indices[batch_id].append(prefix_index)
+        prefix_indices.append(prefix_index)
+
     return prefix_indices

From fe4a81592b714468308ac108f5f48425544c6cc8 Mon Sep 17 00:00:00 2001
From: thomasw21 <24695242+thomasw21@users.noreply.github.com>
Date: Thu, 16 Sep 2021 13:44:10 +0200
Subject: [PATCH 30/30] Fix test

---
 tests/test_model.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tests/test_model.py b/tests/test_model.py
index 7c6357227..3079a429a 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -241,15 +241,7 @@ def test_prefix_lm_wo_reset_attention_mask(self):
             model = model[0]
 
             token_ids = torch.randint(args.padded_vocab_size, (args.micro_batch_size, args.seq_length))
-
-            # process batch to have non-empty prefix
-            for i in range(9, -1, -1):
-                input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
-                if prefix_indices[0][0] != 0:
-                    break
-                if i == 0:
-                    # FIXME: find a better way to avoid an empty prefix
-                    raise ValueError("Could not obtain a non-pathological case where the prefix is not empty")
+            input_batch, _, prefix_indices = get_prefix_lm_batch_pipe({"text": token_ids})
 
             model(*input_batch)
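
The assertions in these tests lean on an equal_vectors helper that the series never shows. Its definition in the repository may differ, but a minimal sketch consistent with how the tests call it, returning one boolean per token position by reducing over the hidden dimension, would be:

import torch

def equal_vectors(tensor1, tensor2, dim=-1):
    # One boolean per position: True where the two activation vectors
    # coincide along `dim` (the hidden dimension in the tests above).
    return torch.linalg.norm(tensor1 - tensor2, dim=dim) == 0

Because the reduction happens inside the helper, the surrounding torch.all(...) and torch.any(...) assertions quantify over token positions rather than over individual scalar activations.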
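
The causal test touched by PATCH 24 and PATCH 25 follows a perturbation pattern that applies to any autoregressive model: increment one input token modulo the vocabulary size, then assert that outputs strictly before the changed position are identical while the changed position and everything after it differs. The following self-contained sketch replays that pattern on a toy single-layer causal attention model; the model, vocabulary size, and sequence length are illustrative stand-ins, not anything from the repository:

import torch

torch.manual_seed(0)

class TinyCausalLM(torch.nn.Module):
    """A single causal self-attention layer over learned embeddings."""
    def __init__(self, vocab_size=97, hidden_size=16):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden_size)
        self.attn = torch.nn.MultiheadAttention(hidden_size, num_heads=2, batch_first=True)

    def forward(self, token_ids):
        x = self.embed(token_ids)
        seq_length = token_ids.size(1)
        # True above the diagonal: future positions are masked out.
        future_mask = torch.triu(torch.ones(seq_length, seq_length, dtype=torch.bool), diagonal=1)
        out, _ = self.attn(x, x, x, attn_mask=future_mask, need_weights=False)
        return out

model = TinyCausalLM().eval()
token_ids = torch.randint(97, (1, 12))

# Perturb a single token, mirroring the increment-modulo trick in the tests.
changed_index = 5
token_ids_changed = token_ids.clone()
token_ids_changed[0, changed_index] = (token_ids_changed[0, changed_index] + 1) % 97

with torch.no_grad():
    output = model(token_ids)
    output_changed = model(token_ids_changed)

same = torch.linalg.norm(output - output_changed, dim=-1) == 0
assert same[0, :changed_index].all()      # past positions are identical
assert not same[0, changed_index:].any()  # the change propagates forward

The first assertion is exact, since positions before changed_index attend only to unchanged keys; the second is probabilistic but fails only if a perturbed activation lands exactly on its old value, which does not happen in practice with random weights.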
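
PATCH 29 is easiest to read as fixing the return-shape contract of get_prefix_indices: with reset_attention_mask=True each row yields a list of prefix indices, one per document, while without it each row yields a bare int; before the fix, the non-reset branch appended into a per-row list that is now only created in the reset branch. Below is a toy illustration of that shape contract only, with hypothetical names; the real function's sampling, EOD handling, and validation are more involved:

from random import randint

def prefix_indices_shapes(batch, eod_token, reset_attention_mask):
    prefix_indices = []
    for row in batch:
        if reset_attention_mask:
            # Per-document prefixes: one list of ints per row.
            row_indices = []
            document_start = 0
            for position, token in enumerate(list(row) + [eod_token]):
                if token == eod_token:
                    if position > document_start:
                        row_indices.append(randint(document_start, position - 1))
                    document_start = position + 1
            prefix_indices.append(row_indices)
        else:
            # One prefix for the whole row: a bare int, not a singleton list.
            prefix_indices.append(randint(0, len(row) - 2))
    return prefix_indices

batch = [[5, 7, 0, 9, 9], [3, 3, 3, 3, 3]]  # 0 plays the role of the EOD token
print(prefix_indices_shapes(batch, eod_token=0, reset_attention_mask=True))   # e.g. [[1, 3], [2]]
print(prefix_indices_shapes(batch, eod_token=0, reset_attention_mask=False))  # e.g. [2, 0]

This shape change is also consistent with PATCH 30 dropping the prefix_indices[0][0] != 0 loop from the non-reset test: once that branch returns bare ints per row, prefix_indices[0][0] no longer indexes a per-row list.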