74 commits
8b04e0e
ICT zeroshot evaluation code
mpatwary Mar 10, 2021
661553f
made more generic, aligned with other tasks
mpatwary Mar 11, 2021
43c9137
Fixed based on review recommendation
mpatwary Mar 19, 2021
4056539
fixed another issue
mpatwary Mar 19, 2021
a5acbf5
Merge branch 'main' into main_retriver_merge_ict_eval
mpatwary Mar 20, 2021
10ff060
implementing DPR
mpatwary Apr 9, 2021
cdde433
Merge branch 'main' into main_retriver_merge_dpr
mpatwary Apr 9, 2021
06076c7
DPR implementation
mpatwary Apr 23, 2021
957d1c9
Merge branch 'main' into main_retriver_merge_dpr
Apr 26, 2021
b9fcb7b
adding dpr code
Apr 29, 2021
8004731
removed comments
Apr 29, 2021
f415dc8
removed comments
Apr 29, 2021
a8d172b
removed comments
Apr 29, 2021
220637f
DPR evaluation debugging
May 11, 2021
d2d5086
DPR ongoing
May 11, 2021
6d03d7a
DPR finetune and evaluation
May 12, 2021
f926720
fixing model evaluation of retriever
May 12, 2021
5409341
added pre and post process
May 12, 2021
7e335e1
added pre and post process
May 12, 2021
f64977f
evaluation works!
May 13, 2021
dca47cf
debugging DPR
May 14, 2021
3f75537
fix copy-n-paste error
stas00 May 17, 2021
07ca952
Typo fix in readme
devrimcavusoglu May 18, 2021
2dae74b
t5 fixes
stas00 May 18, 2021
4a09bb3
Merge branch 'main' into main_retriver_merge_dpr
mpatwary May 18, 2021
7a0710e
before cleaning the comments
mpatwary May 18, 2021
ccae9db
vit pipeline fixes
kvareddy May 18, 2021
2eaf6c7
cleaning the code
mpatwary May 18, 2021
2529380
additional cleaning
mpatwary May 19, 2021
8e44d61
renaming the folders
mpatwary May 19, 2021
113c636
Add temporary assert to finetuning until it can be fixed.
jaredcasper May 19, 2021
7577931
Fixed issues with ICT pretraining
mpatwary May 19, 2021
dfb6a9b
updated the evaluation script for retriever
mpatwary May 19, 2021
f21a662
updated the evaluation script for retriever
mpatwary May 19, 2021
a41e478
updated the evaluation script for retriever
mpatwary May 19, 2021
825375c
updated the evaluation script for retriever
mpatwary May 19, 2021
217f54b
Merge branch 'finetune_assert' into 'main'
shoeybi May 19, 2021
d078e54
added exit interval for finetuning
mpatwary May 20, 2021
63121a9
updating the scripts
mpatwary May 20, 2021
fda81a2
updating no load rng
mpatwary May 25, 2021
01fc083
Merge branch 'vit_pipeline_fixes' into 'main'
jaredcasper Jun 1, 2021
83c4d95
Merge branch 'main_retriver_merge_dpr' into 'main'
jaredcasper Jun 1, 2021
c7c65bb
updating script
mpatwary Jun 3, 2021
84eb016
Merge branch 'main' into main_retriver_merge_dpr
mpatwary Jun 3, 2021
3dadd16
Update T5 scripts
deepakn94 Jun 7, 2021
04c79f3
resolved hang issue
mpatwary Jun 8, 2021
ebfbfce
fixed the tensor size mismatch issue
mpatwary Jun 9, 2021
e46f326
fixed the evaluation hangs
mpatwary Jun 10, 2021
a983cab
Adding readme
mpatwary Jun 10, 2021
d562d7b
Adding readme
mpatwary Jun 10, 2021
1095d7e
Adding readme
mpatwary Jun 10, 2021
bab5cc4
Adding readme
mpatwary Jun 10, 2021
8661ca2
Adding readme
mpatwary Jun 10, 2021
293554a
Adding readme
mpatwary Jun 10, 2021
e287bf0
Adding readme
mpatwary Jun 10, 2021
c45109e
Adding readme
mpatwary Jun 10, 2021
473127f
Clean up README.md a bit
jaredcasper Jun 10, 2021
2845047
addressed comments
mpatwary Jun 10, 2021
98113c6
Merge branch 'main_retriver_merge_dpr' of ssh://gitlab-master.nvidia.…
mpatwary Jun 10, 2021
598d7ee
Merge branch 'main_retriver_merge_dpr' into 'main'
jaredcasper Jun 10, 2021
2be1e51
Merge branch 't5_scripts' into 'main'
jaredcasper Jun 10, 2021
9d350c9
updated readme
mpatwary Jun 10, 2021
baf2e2a
updated readme
mpatwary Jun 10, 2021
32da2e7
updated readme
mpatwary Jun 10, 2021
4c92ca8
updated readme
mpatwary Jun 10, 2021
82b69e8
Merge branch 'main_retriver_merge_dpr' into 'main'
jaredcasper Jun 11, 2021
7898c9a
Merge branch 't5' of https://github.com/stas00/Megatron-LM into githu…
jaredcasper Jun 11, 2021
e1318f0
Merge branch 'typo-fix' of https://github.com/devrimcavusoglu/Megatro…
jaredcasper Jun 11, 2021
4a35d50
Merge branch 'patch-1' of https://github.com/stas00/Megatron-LM into …
jaredcasper Jun 11, 2021
90e0a0d
Merge branch 'github-pr' into 'main'
jaredcasper Jun 11, 2021
3ed5da6
rough carbon tracker test
stas00 Jul 6, 2021
318ef29
fix bug when restarting with no eval in round 1
stas00 Jul 23, 2021
6039727
Merge remote-tracking branch 'origin/master' into cc
stas00 Jul 24, 2021
00bc3ea
wip
stas00 Jul 24, 2021
8 changes: 7 additions & 1 deletion README.md
@@ -113,6 +113,11 @@ python tools/preprocess_data.py \

The output will be two files named, in this case, `my-bert_text_sentence.bin` and `my-bert_text_sentence.idx`. The `--data-path` specified in later BERT training is the full path and new filename, but without the file extension.
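For example, if the two preprocessed files sit under a directory such as `/workspace/data` (a placeholder path), later BERT training would reference them as:
<pre>
--data-path /workspace/data/my-bert_text_sentence \
</pre>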

For T5, use the same preprocessing as for BERT, perhaps renaming the output prefix to:
<pre>
--output-prefix my-t5 \
</pre>
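A full T5 preprocessing command is then identical to the BERT invocation above apart from the prefix; a minimal sketch, assuming the same corpus and vocab files as in the BERT example:
<pre>
python tools/preprocess_data.py \
       --input my-corpus.json \
       --output-prefix my-t5 \
       --vocab bert-vocab.txt \
       --dataset-impl mmap \
       --tokenizer-type BertWordPieceLowerCase \
       --split-sentences
</pre>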

Some minor modifications are required for GPT data preprocessing, namely, the addition of a merge table, an end-of-document token, removal of sentence splitting, and a change to the tokenizer type:
<pre>
python tools/preprocess_data.py \
@@ -247,13 +252,14 @@ T5_ARGS="--num-layers 24 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--vocab-file $VOCAB_FILE \
--vocab-extra-ids 100 \
--split 949,50,1 \
--fp16"

OUTPUT_ARGS=&#60;same as those in <a href="#bert-pretraining">BERT pretraining</a> above&#62;

python pretrain_t5.py \
$BERT_ARGS \
$T5_ARGS \
$OUTPUT_ARGS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
32 changes: 0 additions & 32 deletions examples/create_embeddings.sh

This file was deleted.

@@ -1,19 +1,19 @@
#!/bin/bash

# Evaluate natural question test data given Wikipedia embeddings and pretrained
# ICT model
# ICT model or a finetuned model for Natural Question task

# Datasets can be downloaded from the following link:
# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py

EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
EMBEDDING_PATH=<Specify path of the embeddings>
CHECKPOINT_PATH=<Specify path of pretrained ICT model>
CHECKPOINT_PATH=<Specify path of pretrained ICT model or finetuned model>

QA_FILE=<Path of the natural question test dataset>
QA_FILE=<Path of the natural question dev or test dataset>

python tasks/main.py \
--task ICT-ZEROSHOT-NQ \
--task RETRIEVER-EVAL \
--tokenizer-type BertWordPieceLowerCase \
--num-layers 12 \
--hidden-size 768 \
@@ -29,8 +29,10 @@ python tasks/main.py \
--retriever-seq-length 256 \
--vocab-file bert-vocab.txt\
--qa-data-test ${QA_FILE} \
--num-workers 2 \
--faiss-use-gpu \
--retriever-report-topk-accuracies 1 5 20 100 \
--fp16
--fp16 \
--indexer-log-interval 1000 \
--indexer-batch-size 128


56 changes: 56 additions & 0 deletions examples/finetune_retriever_distributed.sh
@@ -0,0 +1,56 @@
#!/bin/bash

# Finetune a BERT or pretrained ICT model using the Google Natural Questions data
# Datasets can be downloaded from the following link:
# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
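# As a sketch, the NQ retriever data might be fetched with that script roughly as
# follows (the resource key is an assumption; check the DPR repository for the
# exact names):
#   python data/download_data.py --resource data.retriever.nq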

WORLD_SIZE=8

DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"

CHECKPOINT_PATH=<Specify path for the finetuned retriever model>

# Load either of the below
BERT_LOAD_PATH=<Path of BERT pretrained model>
PRETRAINED_CHECKPOINT=<Path of Pretrained ICT model>

python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task RET-FINETUNE-NQ \
--train-with-neg \
--train-hard-neg 1 \
--pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--tokenizer-type BertWordPieceLowerCase \
--train-data nq-train.json \
--valid-data nq-dev.json \
--save ${CHECKPOINT_PATH} \
--load ${CHECKPOINT_PATH} \
--vocab-file bert-vocab.txt \
--bert-load ${BERT_LOAD_PATH} \
--save-interval 5000 \
--log-interval 10 \
--eval-interval 20000 \
--eval-iters 100 \
--indexer-log-interval 1000 \
--faiss-use-gpu \
--DDP-impl torch \
--fp16 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--seq-length 512 \
--retriever-seq-length 256 \
--max-position-embeddings 512 \
--retriever-score-scaling \
--epochs 80 \
--micro-batch-size 8 \
--eval-micro-batch-size 16 \
--indexer-batch-size 128 \
--lr 2e-5 \
--lr-warmup-fraction 0.01 \
--weight-decay 1e-1
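Once the placeholder paths above are filled in, the script can be launched directly, for example:
<pre>
bash examples/finetune_retriever_distributed.sh
</pre>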
5 changes: 3 additions & 2 deletions examples/pretrain_t5.sh
@@ -15,7 +15,7 @@ python pretrain_t5.py \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--global-batch-size 16 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
@@ -35,4 +35,5 @@ python pretrain_t5.py \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
--fp16 \
--vocab-extra-ids 100
5 changes: 3 additions & 2 deletions examples/pretrain_t5_distributed.sh
@@ -24,7 +24,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--global-batch-size 128 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
@@ -44,4 +44,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
--fp16 \
--vocab-extra-ids 100
6 changes: 3 additions & 3 deletions examples/pretrain_t5_distributed_with_mp.sh
@@ -24,8 +24,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--encoder-seq-length 512 \
--decoder-seq-length 128 \
--micro-batch-size 16 \
--global-batch-size 2048 \
--seq-length 512 \
--global-batch-size 128 \
--max-position-embeddings 512 \
--train-iters 1000000 \
--lr-decay-iters 1000000 \
@@ -45,4 +44,5 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--fp16
--fp16 \
--vocab-extra-ids 100
2 changes: 2 additions & 0 deletions megatron/arguments.py
@@ -440,6 +440,8 @@ def _add_training_args(parser):
help='Run optimizer on CPU')
group.add_argument('--cpu_torch_adam', action='store_true',
help='Use Torch Adam as optimizer on CPU.')
group.add_argument('--codecarbon-dir', type=str, default=None,
help='Write CodeCarbon logs to this directory.')

return parser

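When it lands in a launch command, the new flag would be appended like the other training arguments; a minimal sketch, assuming the target directory already exists and the CodeCarbon tracker code consumes the value:
<pre>
       --codecarbon-dir /path/to/codecarbon-logs \
</pre>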
2 changes: 1 addition & 1 deletion megatron/checkpointing.py
@@ -60,8 +60,8 @@ def _compare(arg_name, old_arg_name=None):
_compare('num_layers')
_compare('hidden_size')
_compare('num_attention_heads')
_compare('max_position_embeddings')
if args.vocab_file:
_compare('max_position_embeddings')
_compare('make_vocab_size_divisible_by')
_compare('padded_vocab_size')
_compare('tokenizer_type')
12 changes: 7 additions & 5 deletions megatron/indexer.py
@@ -1,15 +1,16 @@
import sys
import time
import torch
import torch.distributed as dist

from megatron import get_args
from megatron import get_args, print_rank_0
from megatron import mpu
from megatron.checkpointing import load_biencoder_checkpoint
from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset
from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch
from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader
from megatron.data.realm_index import detach, OpenRetreivalDataStore
from megatron.model.biencoder_model import biencoder_model_provider
from megatron.model.biencoder_model import get_model_provider
from megatron.training import get_model


@@ -29,7 +30,6 @@ def __init__(self):
# need to know whether we're using a REALM checkpoint (args.load)
# or ICT checkpoint
assert not (args.load and args.ict_load)
#self.using_realm_chkpt = args.ict_load is None

self.log_interval = args.indexer_log_interval
self.batch_size = args.indexer_batch_size
@@ -47,8 +47,8 @@ def load_attributes(self):
if self.biencoder_shared_query_context_model:
only_context_model = False

model = get_model(lambda: biencoder_model_provider(only_context_model \
= only_context_model, biencoder_shared_query_context_model = \
model = get_model(get_model_provider(only_context_model=\
only_context_model, biencoder_shared_query_context_model=\
self.biencoder_shared_query_context_model))

self.model = load_biencoder_checkpoint(model,
@@ -85,6 +85,7 @@ def build_and_save_index(self):
"""
assert len(self.model) == 1
unwrapped_model = self.model[0]

while not hasattr(unwrapped_model, 'embed_text'):
unwrapped_model = unwrapped_model.module

@@ -103,6 +104,7 @@ def build_and_save_index(self):
context_logits = unwrapped_model.embed_text(
unwrapped_model.context_model, context_tokens, context_mask,
context_types)

context_logits = detach(context_logits)
row_id = detach(row_id)

2 changes: 1 addition & 1 deletion megatron/learning_rates.py
@@ -87,7 +87,7 @@ def get_lr(self):
else:
raise Exception('{} decay style is not supported.'.format(
self.decay_style))

return self.min_lr + coeff * delta_lr

