
Commit

Merge branch 'megatron-bart' of github.com:michalivne/NeMo into megatron-bart
michalivne committed Mar 30, 2022
2 parents 40ad596 + 0b08d2a commit 6c6b7fd
Showing 46 changed files with 382 additions and 468 deletions.
56 changes: 28 additions & 28 deletions Jenkinsfile
@@ -2001,34 +2001,34 @@ pipeline {
sh "rm -rf examples/nlp/language_modeling/bert_index_mappings"
}
}
stage('L2: Megatron P-Tuning GPT LM') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/text_classification/ptune_text_classification.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.max_epochs=1 \
+trainer.limit_val_batches=10 \
+trainer.limit_train_batches=10 \
+trainer.limit_test_batches=10 \
exp_manager.exp_dir=examples/nlp/language_modeling/ptune_results \
model.tokenizer.vocab_file=/home/TestData/nlp/ptune/gpt2-vocab.json \
model.tensor_model_parallel_size=2 \
model.tokenizer.merge_file=/home/TestData/nlp/ptune/gpt2-merges.txt \
model.language_model.nemo_file=/home/TestData/nlp/ptune/small_gpt.nemo \
model.dataset.classes=[positive,neutral,negative] \
model.train_ds.file_path=/home/TestData/nlp/ptune/data/train_0.txt \
model.validation_ds.file_path=/home/TestData/nlp/ptune/data/validation_0.txt \
model.test_ds.file_path=/home/TestData/nlp/ptune/data/test_0.txt "
sh "rm -rf examples/nlp/language_modeling/ptune_results"
}
}
// stage('L2: Megatron P-Tuning GPT LM') {
// when {
// anyOf {
// branch 'main'
// changeRequest target: 'main'
// }
// }
// failFast true
// steps {
// sh "python examples/nlp/text_classification/ptune_text_classification.py \
// trainer.devices=2 \
// trainer.accelerator=gpu \
// trainer.max_epochs=1 \
// +trainer.limit_val_batches=10 \
// +trainer.limit_train_batches=10 \
// +trainer.limit_test_batches=10 \
// exp_manager.exp_dir=examples/nlp/language_modeling/ptune_results \
// model.tokenizer.vocab_file=/home/TestData/nlp/ptune/gpt2-vocab.json \
// model.tensor_model_parallel_size=2 \
// model.tokenizer.merge_file=/home/TestData/nlp/ptune/gpt2-merges.txt \
// model.language_model.nemo_file=/home/TestData/nlp/ptune/small_gpt.nemo \
// model.dataset.classes=[positive,neutral,negative] \
// model.train_ds.file_path=/home/TestData/nlp/ptune/data/train_0.txt \
// model.validation_ds.file_path=/home/TestData/nlp/ptune/data/validation_0.txt \
// model.test_ds.file_path=/home/TestData/nlp/ptune/data/test_0.txt "
// sh "rm -rf examples/nlp/language_modeling/ptune_results"
// }
// }
stage('L2: Megatron GPT Pretraining and Resume Training TP=2') {
when {
anyOf {
4 changes: 2 additions & 2 deletions docs/source/core/core.rst
@@ -84,11 +84,11 @@ Creating a NeMo model is similar to any other PyTorch workflow. We start by init
# instantiate a BERT based encoder
self.bert_model = get_lm_model(
pretrained_model_name=cfg.language_model.pretrained_model_name,
config_file=cfg.language_model.config_file,
config_dict=cfg.language_model.config,
checkpoint_file=cfg.language_model.lm_checkpoint,
vocab_file=cfg.tokenizer.vocab_file,
trainer=trainer,
cfg=cfg,
)
# instantiate the FFN for classification
58 changes: 0 additions & 58 deletions docs/source/nemo_text_processing/textprocessing_all.bib

This file was deleted.

4 changes: 0 additions & 4 deletions docs/source/nlp/api.rst
@@ -43,10 +43,6 @@ Modules
:show-inheritance:
:members:

.. autoclass:: nemo.collections.nlp.modules.common.megatron.MegatronBertEncoder
:show-inheritance:
:members:

.. autoclass:: nemo.collections.nlp.modules.AlbertEncoder
:show-inheritance:
:members:
6 changes: 3 additions & 3 deletions docs/source/nlp/text_normalization/nn_text_normalization.rst
@@ -66,7 +66,7 @@ In the example, ``self`` denotes that the spoken form is the same as the written
<eos> <eos>
More information about the Google text normalization dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge <https://arxiv.org/ftp/arxiv/papers/1611/1611.00068.pdf>`__ :cite:`nlp-textnorm-Sproat2016RNNAT`.
More information about the Google text normalization dataset can be found in the paper `RNN Approaches to Text Normalization: A Challenge <https://arxiv.org/ftp/arxiv/papers/1611/1611.00068.pdf>`__ :cite:`nlp-textnorm-sproat2016rnn`.
The script for splitting the Google text normalization data files into `train`, `dev`, `test` can be found here:
`data/data_split.py <https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/duplex_text_normalization/data/data_split.py>`__.
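
For quick inspection, the raw Google text normalization files can be read with a few lines of Python. The sketch below assumes the commonly described layout (one tab-separated ``semiotic_class  written  spoken`` token per line, ``<eos>`` rows between sentences, and ``self``/``sil`` marking tokens whose spoken form is unchanged or silent); the helper name ``read_google_tn_file`` is illustrative and not part of NeMo.

.. code-block:: python

    # Sketch only: parse one Google TN data file into (written, spoken) sentence pairs,
    # assuming "<class>\t<written>\t<spoken>" token lines and "<eos>" rows between sentences.
    def read_google_tn_file(path):
        sentences, written, spoken = [], [], []
        with open(path, encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip("\n").split("\t")
                if parts[0] == "<eos>":
                    if written:
                        sentences.append((" ".join(written), " ".join(spoken)))
                    written, spoken = [], []
                    continue
                _cls, w, s = parts[:3]
                written.append(w)
                if s in ("self", "<self>"):
                    spoken.append(w)   # spoken form identical to the written form
                elif s != "sil":       # "sil": punctuation with no spoken form
                    spoken.append(s)
        return sentences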

@@ -228,7 +228,7 @@ This pipeline consists of
* adding space around `-` in alpha-numerical words, e.g. `2-car` -> `2 - car`
* converting unicode fraction e.g. ½ to 1/2
* normalizing greek letters and some special characters, e.g. `+` -> `plus`
* Moses :cite:`koehnetal2007moses`. tokenization/preprocessing of the input
* Moses :cite:`nlp-textnorm-koehn-etal-2007-moses`. tokenization/preprocessing of the input
* inference with neural tagger and decoder
* Moses postprocessing/ detokenization
* WFST-based grammars to verbalize some `VERBATIM`
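
A rough Python rendering of the first few character-level steps (the Moses and WFST stages are omitted). The character table and the function name below are only an illustrative subset of what the pipeline covers, not NeMo's actual implementation:

.. code-block:: python

    import re

    # Illustrative subset only; the real pipeline covers many more symbols.
    CHAR_MAP = {"½": "1/2", "¼": "1/4", "¾": "3/4", "+": " plus ", "β": " beta "}

    def basic_preprocess(text: str) -> str:
        # add space around '-' in alpha-numerical words, e.g. "2-car" -> "2 - car"
        text = re.sub(r"(?<=\w)-(?=\w)", " - ", text)
        # convert unicode fractions and a few special characters / greek letters
        for ch, repl in CHAR_MAP.items():
            text = text.replace(ch, repl)
        # collapse any duplicated whitespace introduced above
        return " ".join(text.split())

    print(basic_preprocess("a 2-car garage costs +½ more"))
    # -> "a 2 - car garage costs plus 1/2 more"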
@@ -245,7 +245,7 @@ transform them into the appropriate forms (e.g., spoken forms for TN and written
The decoder model is essentially a Transformer-based encoder-decoder seq2seq model (e.g., the example
training script uses the T5-base model by default). Overall, our design is partly inspired by the
RNN-based sliding window model proposed in the paper
`Neural Models of Text Normalization for Speech Applications <https://research.fb.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf>`__ :cite:`nlp-textnorm-Zhang2019NeuralMO`.
`Neural Models of Text Normalization for Speech Applications <https://research.fb.com/wp-content/uploads/2019/03/Neural-Models-of-Text-Normalization-for-Speech-Applications.pdf>`__ :cite:`nlp-textnorm-zhang2019neural`.
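
As a standalone illustration of that architecture choice, the decoder builds on a plain Hugging Face encoder-decoder. The snippet below only demonstrates the seq2seq interface with an off-the-shelf ``t5-base`` checkpoint (the default mentioned above); it is not a trained normalizer, and the prompt formatting used by the NeMo training script may differ.

.. code-block:: python

    # Sketch of the underlying encoder-decoder interface; a trained NeMo
    # checkpoint and the proper input formatting are needed for real TN/ITN.
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("t5-base")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

    inputs = tokenizer("the rate was 5% on May 5 2021", return_tensors="pt")
    output_ids = model.generate(**inputs, max_length=64)
    print(tokenizer.batch_decode(output_ids, skip_special_tokens=True))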

We introduce a simple but effective technique to allow our model to be duplex. Depending on the
task the model is handling, we append the appropriate prefix to the input. For example, suppose
34 changes: 34 additions & 0 deletions docs/source/nlp/text_normalization/tn_itn_all.bib
@@ -53,4 +53,38 @@ @incollection{mohri2009weighted
pages={213--254},
year={2009},
publisher={Springer}
}


@article{zhang2019neural,
title={Neural models of text normalization for speech applications},
author={Zhang, Hao and Sproat, Richard and Ng, Axel H and Stahlberg, Felix and Peng, Xiaochang and Gorman, Kyle and Roark, Brian},
journal={Computational Linguistics},
year={2019},
}


@inproceedings{koehn-etal-2007-moses,
title = "{M}oses: Open Source Toolkit for Statistical Machine Translation",
author = "Koehn, Philipp and
Hoang, Hieu and
Birch, Alexandra and
Callison-Burch, Chris and
Federico, Marcello and
Bertoldi, Nicola and
Cowan, Brooke and
Shen, Wade and
Moran, Christine and
Zens, Richard and
Dyer, Chris and
Bojar, Ond{\v{r}}ej and
Constantin, Alexandra and
Herbst, Evan",
booktitle = "Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions",
month = jun,
year = "2007",
address = "Prague, Czech Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P07-2045",
pages = "177--180",
}
@@ -44,7 +44,6 @@ model:
lm_checkpoint: null
config_file: null # json file, precedence over config
config: null
nemo_file: null

classifier_head:
num_output_layers: 2
@@ -86,7 +86,6 @@ model:
lm_checkpoint: null
config_file: null # json file, precedence over config
config: null
nemo_file: null


head:
2 changes: 1 addition & 1 deletion nemo/collections/asr/modules/rnnt.py
@@ -208,7 +208,7 @@ def predict(
Args:
y: Optional torch tensor of shape [B, U] of dtype long which will be passed to the Embedding.
If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on Embedding.
If None, creates a zero tensor of shape [B, 1, H] which mimics output of pad-token on EmbeddiNg.
state: An optional list of states for the RNN. Eg: For LSTM, it is the state list length is 2.
Each state must be a tensor of shape [L, B, H].
@@ -59,8 +59,7 @@ def __init__(
self.cfg = cfg
self.data_prepared = False

self.setup_tokenizer(cfg.tokenizer)
super().__init__(cfg=cfg, trainer=trainer)
super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True)

if self.cfg.library == "huggingface":
self.language_model = AutoModelWithLMHead.from_pretrained(cfg.language_model.pretrained_model_name)
@@ -36,7 +36,6 @@
from nemo.collections.nlp.losses import SGDDialogueStateLoss
from nemo.collections.nlp.models.nlp_model import NLPModel
from nemo.collections.nlp.modules import SGDDecoder, SGDEncoder
from nemo.collections.nlp.modules.common.lm_utils import get_lm_model
from nemo.collections.nlp.parts.utils_funcs import tensor2list
from nemo.core.classes.common import PretrainedModelInfo, typecheck
from nemo.core.neural_types import NeuralType
@@ -52,32 +51,18 @@ class SGDQAModel(NLPModel):
"""Dialogue State Tracking Model SGD-QA"""

@property
def input_types(self) -> Optional[Dict[str, NeuralType]]:
return self.bert_model.input_types

@property
def output_types(self) -> Optional[Dict[str, NeuralType]]:
return self.decoder.output_types
def output_module(self):
return self.decoder

def __init__(self, cfg: DictConfig, trainer: Trainer = None):

self.data_prepared = False
self.setup_tokenizer(cfg.tokenizer)
super().__init__(cfg=cfg, trainer=trainer)
self.bert_model = get_lm_model(
pretrained_model_name=cfg.language_model.pretrained_model_name,
config_file=self.register_artifact('language_model.config_file', cfg.language_model.config_file),
config_dict=OmegaConf.to_container(cfg.language_model.config) if cfg.language_model.config else None,
checkpoint_file=cfg.language_model.lm_checkpoint,
vocab_file=self.register_artifact('tokenizer.vocab_file', cfg.tokenizer.vocab_file),
)

self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout)
self.decoder = SGDDecoder(embedding_dim=self.bert_model.config.hidden_size)
self.loss = SGDDialogueStateLoss(reduction="mean")

@typecheck()
def forward(self, input_ids, token_type_ids, attention_mask):
def forward(self, input_ids, attention_mask, token_type_ids):
token_embeddings = self.bert_model(
input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
)
@@ -71,11 +71,11 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
if trainer is not None:
self.world_size = trainer.num_nodes * trainer.num_gpus

self._tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer)
self.tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer)

super().__init__(cfg=cfg, trainer=trainer)
super().__init__(cfg=cfg, trainer=trainer, no_lm_init=True)
self.model = AutoModelForSeq2SeqLM.from_pretrained(cfg.transformer)
self.max_sequence_len = cfg.get('max_sequence_len', self._tokenizer.model_max_length)
self.max_sequence_len = cfg.get('max_sequence_len', self.tokenizer.model_max_length)
self.mode = cfg.get('mode', 'joint')

self.transformer_name = cfg.transformer
@@ -103,7 +103,7 @@ def setup_cgs(self, cfg: DictConfig):
self.neural_confidence_threshold = cfg.get('neural_confidence_threshold', 0.99)
self.n_tagged = cfg.get('n_tagged', 1)
input_case = 'cased' # input_case is cased by default
if hasattr(self._tokenizer, 'do_lower_case') and self._tokenizer.do_lower_case:
if hasattr(self.tokenizer, 'do_lower_case') and self.tokenizer.do_lower_case:
input_case = 'lower_cased'
if not PYNINI_AVAILABLE:
raise Exception(
@@ -158,7 +158,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0, split="val"):
labels=batch['labels'],
)

labels_str = self._tokenizer.batch_decode(
labels_str = self.tokenizer.batch_decode(
torch.ones_like(batch['labels']) * ((batch['labels'] == -100) * 100) + batch['labels'],
skip_special_tokens=True,
)
@@ -291,7 +291,7 @@ def _generate_predictions(self, input_ids: torch.Tensor, model_max_len: int = 51
)

generated_ids, sequence_toks_scores = outputs['sequences'], outputs['scores']
generated_texts = self._tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
generated_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

return generated_texts, generated_ids, sequence_toks_scores

@@ -319,7 +319,7 @@ def _infer(

if sum(nb_spans) == 0:
return [[]] * len(sents)
model, tokenizer = self.model, self._tokenizer
model, tokenizer = self.model, self.tokenizer
ctx_size = constants.DECODE_CTX_SIZE
extra_id_0 = constants.EXTRA_ID_0
extra_id_1 = constants.EXTRA_ID_1
@@ -370,7 +370,7 @@ def _infer(
# Compute selected_toks_probs
selected_toks_probs = []
for jx, _id in enumerate(cur_generated_ids):
if _id != self._tokenizer.pad_token_id:
if _id != self.tokenizer.pad_token_id:
selected_toks_probs.append(cur_toks_probs[jx, _id])
else:
selected_toks_probs.append(1)
Expand Down Expand Up @@ -481,7 +481,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):

dataset = TextNormalizationDecoderDataset(
input_file=input_file,
tokenizer=self._tokenizer,
tokenizer=self.tokenizer,
tokenizer_name=self.transformer_name,
mode=self.mode,
max_len=self.max_sequence_len,
@@ -504,7 +504,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):
self._val_id_to_class.append({v: k for k, v in dataset.label_ids_semiotic.items()})

data_collator = DataCollatorForSeq2Seq(
self._tokenizer, model=self.model, label_pad_token_id=constants.LABEL_PAD_TOKEN_ID, padding=True
self.tokenizer, model=self.model, label_pad_token_id=constants.LABEL_PAD_TOKEN_ID, padding=True
)
dl = torch.utils.data.DataLoader(
dataset=dataset,
