diff --git a/docs/source/asr/data/benchmark_en.csv b/docs/source/asr/data/benchmark_en.csv index 5f68e9ca22ce..5c764ba38651 100644 --- a/docs/source/asr/data/benchmark_en.csv +++ b/docs/source/asr/data/benchmark_en.csv @@ -28,4 +28,7 @@ stt_en_conformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/ca stt_en_conformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_conformer_transducer_xxlarge" stt_en_fastconformer_transducer_large,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_large" stt_en_fastconformer_ctc_large,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_large" -stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" \ No newline at end of file +stt_en_fastconformer_hybrid_large_pc,EncDecHybridRNNTCTCBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_hybrid_large_pc" +stt_en_fastconformer_transducer_xlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge" +stt_en_fastconformer_ctc_xlarge,EncDecCTCModelBPE,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge" +stt_en_fastconformer_transducer_xxlarge,EncDecRNNTBPEModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge" \ No newline at end of file diff --git a/docs/source/asr/speaker_recognition/data/speaker_results.csv b/docs/source/asr/speaker_recognition/data/speaker_results.csv index a0e865c9c487..c92c971e4939 100644 --- a/docs/source/asr/speaker_recognition/data/speaker_results.csv +++ b/docs/source/asr/speaker_recognition/data/speaker_results.csv @@ -1,4 +1,5 @@ Model Name,Model Base Class,Model Card 
titanet_large,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_large" +titanet_small,EncDecSpeakerLabelModel,"https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/titanet_small" speakerverification_speakernet,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:speakerverification_speakernet" ecapa_tdnn,EncDecSpeakerLabelModel,"https://ngc.nvidia.com/catalog/models/nvidia:nemo:ecapa_tdnn" \ No newline at end of file diff --git a/docs/source/nlp/nlp_all.bib b/docs/source/nlp/nlp_all.bib index fd0f15f6d1da..48a53240e52b 100644 --- a/docs/source/nlp/nlp_all.bib +++ b/docs/source/nlp/nlp_all.bib @@ -216,3 +216,12 @@ @article{jegou2022faiss pages={ascl--2210}, year={2022} } + +@misc{antonova2023spellmapper, + title={SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings}, + author={Alexandra Antonova and Evelina Bakhturina and Boris Ginsburg}, + year={2023}, + eprint={2306.02317}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} diff --git a/docs/source/nlp/spellchecking_asr_customization.rst b/docs/source/nlp/spellchecking_asr_customization.rst index f9009b520361..c6666c4e338c 100644 --- a/docs/source/nlp/spellchecking_asr_customization.rst +++ b/docs/source/nlp/spellchecking_asr_customization.rst @@ -3,7 +3,7 @@ SpellMapper (Spellchecking ASR Customization) Model ===================================================== -SpellMapper is a non-autoregressive model for postprocessing of ASR output. It gets as input a single ASR hypothesis (text) and a custom vocabulary and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. Unlike traditional spellchecking approaches, which aim to correct known words using language models, SpellMapper's goal is to correct highly specific user terms, out-of-vocabulary (OOV) words or spelling variations (e.g., "John Koehn", "Jon Cohen"). 
+`SpellMapper <https://arxiv.org/abs/2306.02317>`__ :cite:`nlp-ner-antonova2023spellmapper` is a non-autoregressive model for postprocessing of ASR output. It gets as input a single ASR hypothesis (text) and a custom vocabulary and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. Unlike traditional spellchecking approaches, which aim to correct known words using language models, SpellMapper's goal is to correct highly specific user terms, out-of-vocabulary (OOV) words or spelling variations (e.g., "John Koehn", "Jon Cohen"). This model is an alternative to word boosting/shallow fusion approaches: diff --git a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml index 8c7561381299..41a8abd93758 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_ctc_bpe.yaml @@ -17,6 +17,22 @@ # | bf16 | 32GB | 64 | # | | 80GB | 128 | # +-----------+------------+------------+ +# Here are the recommended configs for different variants of FastConformer-CTC-BPE, other parameters are the same as in this config file. 
+# +# +--------------+---------+---------+----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Model | d_model | n_heads | n_layers |conv_kernel_size| weight_decay | pred_hidden/joint_hidden | pred_rnn_layers | xscaling | +# +==============+=========+========+===========+================+==============+==========================+=================+============+ +# | Small (14M) | 176 | 4 | 16 | 9 | 0.0 | 320 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Medium (32M) | 256 | 4 | 16 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Large (120M) | 512 | 8 | 17 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XLarge (616M)| 1024 | 8 | 24 | 9 | 1e-3 | 640 | 2 | False | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XXLarge(1.2B)| 1024 | 8 | 42 | 5 | 1e-3 | 640 | 2 | False | +# +--------------------------------------------------------------+--------------+--------------------------+-----------------+------------+ + # Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. # Default learning parameters in this config are set for global batch size of 2K while you may use lower values. 
diff --git a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml index 0b0ec78e077d..9e3da8d3545f 100644 --- a/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml +++ b/examples/asr/conf/fastconformer/fast-conformer_transducer_bpe.yaml @@ -17,6 +17,22 @@ # | bf16 | 32GB | 64 | # | | 80GB | 128 | # +-----------+------------+------------+ +# Here are the recommended configs for different variants of FastConformer-Transducer-BPE, other parameters are the same as in this config file. +# +# +--------------+---------+---------+----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Model | d_model | n_heads | n_layers |conv_kernel_size| weight_decay | pred_hidden/joint_hidden | pred_rnn_layers | xscaling | +# +==============+=========+========+===========+================+==============+==========================+=================+============+ +# | Small (14M) | 176 | 4 | 16 | 9 | 0.0 | 320 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Medium (32M) | 256 | 4 | 16 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | Large (120M) | 512 | 8 | 17 | 9 | 1e-3 | 640 | 1 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XLarge (616M)| 1024 | 8 | 24 | 9 | 1e-3 | 640 | 2 | True | +# +--------------+---------+--------+-----------+----------------+--------------+--------------------------+-----------------+------------+ +# | XXLarge(1.2B)| 1024 | 8 | 42 | 5 | 1e-3 | 640 | 2 | False | +# 
+--------------------------------------------------------------+--------------+--------------------------+-----------------+------------+ + # Note: They are based on the assumption of max_duration of 20. If you have longer or shorter max_duration, then batch sizes may need to get updated accordingly. # Default learning parameters in this config are set for global batch size of 2K while you may use lower values. diff --git a/examples/nlp/spellchecking_asr_customization/README.md b/examples/nlp/spellchecking_asr_customization/README.md index 2d83fd8d11ad..9d2063eff181 100644 --- a/examples/nlp/spellchecking_asr_customization/README.md +++ b/examples/nlp/spellchecking_asr_customization/README.md @@ -1,6 +1,6 @@ # SpellMapper - spellchecking model for ASR Customization - -This model is inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf, but does not repeat its implementation. +Paper: https://arxiv.org/abs/2306.02317 +This model was partly inspired by Microsoft's paper https://arxiv.org/pdf/2203.00888.pdf. The goal is to build a model that gets as input a single ASR hypothesis (text) and a vocabulary of custom words/phrases and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any. Our model is non-autoregressive (NAR) based on transformer architecture (BERT with multiple separators). 
diff --git a/nemo/collections/asr/models/ctc_bpe_models.py b/nemo/collections/asr/models/ctc_bpe_models.py index a74c7f3de5c2..7d3b236b2bab 100644 --- a/nemo/collections/asr/models/ctc_bpe_models.py +++ b/nemo/collections/asr/models/ctc_bpe_models.py @@ -606,4 +606,11 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_ctc_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_ctc_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_ctc_xlarge/versions/1.20.0/files/stt_en_fastconformer_ctc_xlarge.nemo", + ) + results.append(model) + return results diff --git a/nemo/collections/asr/models/label_models.py b/nemo/collections/asr/models/label_models.py index aefa8743826b..cc789dacff11 100644 --- a/nemo/collections/asr/models/label_models.py +++ b/nemo/collections/asr/models/label_models.py @@ -92,6 +92,13 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) result.append(model) + model = PretrainedModelInfo( + pretrained_model_name="titanet_small", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:titanet_small", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/titanet_small/versions/1.19.0/files/titanet-s.nemo", + ) + result.append(model) + return result def __init__(self, cfg: DictConfig, trainer: Trainer = None): diff --git a/nemo/collections/asr/models/rnnt_bpe_models.py b/nemo/collections/asr/models/rnnt_bpe_models.py index 6fed8be9d410..9ed38a376103 100644 --- a/nemo/collections/asr/models/rnnt_bpe_models.py +++ b/nemo/collections/asr/models/rnnt_bpe_models.py @@ -253,6 +253,20 @@ def list_available_models(cls) -> List[PretrainedModelInfo]: ) results.append(model) + model = PretrainedModelInfo( + 
pretrained_model_name="stt_en_fastconformer_transducer_xlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_xlarge/versions/1.20.1/files/stt_en_fastconformer_transducer_xlarge.nemo", + ) + results.append(model) + + model = PretrainedModelInfo( + pretrained_model_name="stt_en_fastconformer_transducer_xxlarge", + description="For details about this model, please visit https://ngc.nvidia.com/catalog/models/nvidia:nemo:stt_en_fastconformer_transducer_xxlarge", + location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/stt_en_fastconformer_transducer_xxlarge/versions/1.20.0/files/stt_en_fastconformer_transducer_xxlarge.nemo", + ) + results.append(model) + return results def __init__(self, cfg: DictConfig, trainer: Trainer = None): diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index fc889de2dc63..15ffb2dd1bcd 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -43,6 +43,7 @@ @experimental class SpellcheckingAsrCustomizationModel(NLPModel): """ + https://arxiv.org/abs/2306.02317 BERT-based model for Spellchecking ASR Customization. It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. 
diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb index e11025aeb1d3..cc949ad699b3 100644 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb @@ -1,13 +1,14 @@ { "cells": [ { + "attachments": {}, "cell_type": "markdown", "metadata": { "id": "PiRuohn_FQco" }, "source": [ "# Overview\n", - "This tutorial demonstrates how to run inference with SpellMapper - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", + "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", "\n", "Estimated time: 10-15 min.\n", "\n", @@ -957,25 +958,25 @@ }, { "cell_type": "markdown", - "source": [ - "Free GPU memory to avoid OOM." - ], "metadata": { "id": "bt2TMLLvdUHm" - } + }, + "source": [ + "Free GPU memory to avoid OOM." + ] }, { "cell_type": "code", - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ], + "execution_count": null, "metadata": { "id": "ZwEpAOCaRH7s" }, "outputs": [], - "execution_count": null + "source": [ + "del spectrogram_generator\n", + "del vocoder\n", + "torch.cuda.empty_cache()" + ] }, { "cell_type": "markdown", @@ -1363,22 +1364,27 @@ ] }, { + "attachments": {}, "cell_type": "markdown", "metadata": { "id": "upvTbkFAeYtR" }, "source": [ "# Final notes\n", - "1. Our paper...\n", + "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", "\n", - "2. To reproduce evaluation experiments from this paper see these scripts:\n", + "2. 
Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", + "\n", + "3. To reproduce evaluation experiments from this paper see these scripts:\n", " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", "\n", - "3. To reproduce training see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", + "4. To reproduce creation of training data see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", + "\n", + "5. To run training see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", "\n", - "4. Promising future research directions would be:\n", + "6. Promising future research directions would be:\n", " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", " - retrain with adding more various false positives to the training data" ] @@ -1387,9 +1393,9 @@ "metadata": { "accelerator": "GPU", "colab": { - "toc_visible": true, + "gpuType": "T4", "provenance": [], - "gpuType": "T4" + "toc_visible": true }, "kernelspec": { "display_name": "Python 3", @@ -1401,4 +1407,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +}