ASR Refactoring (#2240)
* Refactor out the preprocessing from ASR into common

Signed-off-by: smajumdar <[email protected]>

* Correct nltk issue with vocabs.py for clusters

Signed-off-by: smajumdar <[email protected]>

* Add typing information to SpecAugment and SpecCutout

Signed-off-by: smajumdar <[email protected]>

* Reorganize parts directory

Signed-off-by: smajumdar <[email protected]>

* Refactor parts submodules, add __init__ to few important parts

Signed-off-by: smajumdar <[email protected]>

* Update docs for new path to parts

Signed-off-by: smajumdar <[email protected]>

* Cherry pick PR #2219

Signed-off-by: smajumdar <[email protected]>

* Add header for preprocessing commons

Signed-off-by: smajumdar <[email protected]>

* Fix style of tests

Signed-off-by: smajumdar <[email protected]>

* Add forced update of configs for train-val-test ds to new labels tests

Signed-off-by: smajumdar <[email protected]>

* Update path to FilterbankFeatures for TTS

Signed-off-by: smajumdar <[email protected]>

* Add an alias file for backward compatibility

Signed-off-by: smajumdar <[email protected]>

* Add an alias file for backward compatibility (see the sketch after this commit list)

Signed-off-by: smajumdar <[email protected]>

* Update training scripts of ASR to support finetuning

Signed-off-by: smajumdar <[email protected]>

* Update Finetuning step to be ModelPT level

Signed-off-by: smajumdar <[email protected]>

* Update docs for finetuning for ASR

Signed-off-by: smajumdar <[email protected]>

* Fix style

Signed-off-by: smajumdar <[email protected]>

* Update docs and scripts with fine-tuning info

Signed-off-by: smajumdar <[email protected]>

* Update docs and scripts with fine-tuning info

Signed-off-by: smajumdar <[email protected]>

* Fix style

Signed-off-by: smajumdar <[email protected]>

* Update scripts

Signed-off-by: smajumdar <[email protected]>

* Add comment for weight initialization

Signed-off-by: smajumdar <[email protected]>
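
For context, the "alias file for backward compatibility" mentioned above is typically a thin module that re-exports names from their new location so that pre-refactor import paths keep resolving. A minimal sketch of the pattern, assuming the parts/submodules layout introduced by this commit (the file name and re-exported module are illustrative, not taken from the diff):

    # Hypothetical alias module, e.g. nemo/collections/asr/parts/jasper.py.
    # Old imports such as `from nemo.collections.asr.parts.jasper import JasperBlock`
    # keep working by re-exporting from the new submodules package.
    from nemo.collections.asr.parts.submodules.jasper import *  # noqa: F401,F403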
titu1994 authored May 26, 2021
1 parent b94c7a1 commit bee43e8
Showing 94 changed files with 1,082 additions and 654 deletions.
34 changes: 17 additions & 17 deletions docs/source/asr/api.rst
@@ -62,19 +62,19 @@ Modules
Parts
-----

.. autoclass:: nemo.collections.asr.parts.jasper.JasperBlock
.. autoclass:: nemo.collections.asr.parts.submodules.jasper.JasperBlock
:show-inheritance:
:members:


Mixins
------

.. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRBPEMixin
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin
:show-inheritance:
:members:

@@ -129,39 +129,39 @@ Audio Augmentors
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.SpeedPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.SpeedPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.TimeStretchPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TimeStretchPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.GainPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.GainPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.ImpulsePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ImpulsePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.ShiftPerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.ShiftPerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.NoisePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.NoisePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.WhiteNoisePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.WhiteNoisePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.RirAndNoisePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.RirAndNoisePerturbation
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.perturb.TranscodePerturbation
.. autoclass:: nemo.collections.asr.parts.preprocessing.perturb.TranscodePerturbation
:show-inheritance:
:members:

@@ -179,25 +179,25 @@ RNNT Decoding
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.rnnt_greedy_decoding.GreedyRNNTInfer
.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyRNNTInfer
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.rnnt_greedy_decoding.GreedyBatchedRNNTInfer
.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_greedy_decoding.GreedyBatchedRNNTInfer
:show-inheritance:
:members:

.. autoclass:: nemo.collections.asr.parts.rnnt_beam_decoding.BeamRNNTInfer
.. autoclass:: nemo.collections.asr.parts.submodules.rnnt_beam_decoding.BeamRNNTInfer
:show-inheritance:
:members:

Hypotheses
~~~~~~~~~~

.. autoclass:: nemo.collections.asr.parts.rnnt_utils.Hypothesis
.. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.Hypothesis
:show-inheritance:
:no-members:

.. autoclass:: nemo.collections.asr.parts.rnnt_utils.NBestHypotheses
.. autoclass:: nemo.collections.asr.parts.utils.rnnt_utils.NBestHypotheses
:show-inheritance:
:no-members:
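
Taken together, the api.rst changes above track the new package layout: classes formerly importable from ``nemo.collections.asr.parts`` now live under ``parts.submodules``, ``parts.preprocessing``, ``parts.mixins`` and ``parts.utils``. A quick sketch of imports against the new layout (class paths are taken from the diff above; usage is illustrative):

    # New-style imports after this refactor (the commit also adds alias files
    # so that the old flat `parts` paths keep working).
    from nemo.collections.asr.parts.submodules.jasper import JasperBlock
    from nemo.collections.asr.parts.preprocessing.perturb import SpeedPerturbation
    from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis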
57 changes: 55 additions & 2 deletions docs/source/asr/configs.rst
@@ -342,7 +342,7 @@ configuration is a shortform notation for Citrinet-21x5xC, such that ``B = 21``
not be changed.

To use Citrinet instead of QuartzNet, refer to the ``citrinet_512.yaml`` configuration found inside the ``examples/asr/conf/citrinet``
directory. Citrinet is primarily comprised of the same :class:`~nemo.collections.asr.parts.jasper.JasperBlock` as ``Jasper`` or
directory. Citrinet is primarily comprised of the same :class:`~nemo.collections.asr.parts.submodules.jasper.JasperBlock` as ``Jasper`` or
``QuartzNet``.
While the configs for Citrinet and QuartzNet are similar, we note the additional flags used for Citrinet below. Refer to the
@@ -442,7 +442,7 @@ changed slightly as Citrinet utilizes sub-word tokenization.
.. note::
The following information is relevant to any of the above models that implements its encoder as an :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder`, and utilizes the ``SqueezeExcite`` mechanism.

The ``SqueezeExcite`` block within a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` network can be modified to utilize a different context window after the model has been instantiated (even after the model has been trained) so as to evaluate the model with limited context. This can be achieved using the :meth:`~nemo.collections.asr.parts.mixins.ASRModuleMixin.change_conv_asr_se_context_window`
The ``SqueezeExcite`` block within a :class:`~nemo.collections.asr.modules.conv_asr.ConvASREncoder` network can be modified to utilize a different context window after the model has been instantiated (even after the model has been trained) so as to evaluate the model with limited context. This can be achieved using the :meth:`~nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin.change_conv_asr_se_context_window`

.. code-block:: python
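
    # Hedged sketch -- the snippet under this directive is collapsed in the diff view.
    # It would demonstrate calling change_conv_asr_se_context_window() on an already
    # instantiated model; the model name and context size are illustrative assumptions.
    import nemo.collections.asr as nemo_asr

    model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_en_citrinet_512")

    # Evaluate with a limited SqueezeExcite context of 256 timesteps
    model.change_conv_asr_se_context_window(context_window=256)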
@@ -473,3 +473,56 @@ specify the tokenizer if you want to use sub-word encoding instead of character-

The encoder section includes the details about the Conformer-CTC encoder architecture. You may find more information in the
config files and also :doc:`nemo.collections.asr.modules.ConformerEncoder<./api.html#nemo.collections.asr.modules.ConformerEncoder>`.


Fine-tuning Configurations
--------------------------

All ASR scripts support easy fine-tuning by partially or fully loading pretrained weights from a checkpoint into the currently instantiated model. Pretrained weights can be provided in several ways:

1) Providing a path to a NeMo model (via ``init_from_nemo_model``)
2) Providing the name of a pretrained NeMo model, which will be downloaded from the cloud (via ``init_from_pretrained_model``)
3) Providing a path to a PyTorch Lightning checkpoint file (via ``init_from_ptl_ckpt``)

Fine-tuning via a NeMo model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: sh

    python examples/asr/script_to_<script_name>.py \
        --config-path=<path to dir of configs> \
        --config-name=<name of config without .yaml> \
        model.train_ds.manifest_filepath="<path to manifest file>" \
        model.validation_ds.manifest_filepath="<path to manifest file>" \
        trainer.gpus=-1 \
        trainer.max_epochs=50 \
        +init_from_nemo_model="<path to .nemo model file>"

Fine-tuning via a NeMo pretrained model name
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: sh

    python examples/asr/script_to_<script_name>.py \
        --config-path=<path to dir of configs> \
        --config-name=<name of config without .yaml> \
        model.train_ds.manifest_filepath="<path to manifest file>" \
        model.validation_ds.manifest_filepath="<path to manifest file>" \
        trainer.gpus=-1 \
        trainer.max_epochs=50 \
        +init_from_pretrained_model="<name of pretrained checkpoint>"

Fine-tuning via a PyTorch Lightning checkpoint
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: sh

    python examples/asr/script_to_<script_name>.py \
        --config-path=<path to dir of configs> \
        --config-name=<name of config without .yaml> \
        model.train_ds.manifest_filepath="<path to manifest file>" \
        model.validation_ds.manifest_filepath="<path to manifest file>" \
        trainer.gpus=-1 \
        trainer.max_epochs=50 \
        +init_from_ptl_ckpt="<path to PyTorch Lightning checkpoint file>"
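
Internally, each of these scripts constructs the model and then calls ``maybe_init_from_pretrained_checkpoint(cfg)``, which reads whichever ``init_from_*`` key is present and loads the corresponding weights (see the updated example scripts later in this diff). A rough manual equivalent is sketched below; the path and the choice of ``EncDecCTCModel`` are illustrative assumptions, and ``cfg`` and ``trainer`` come from the usual Hydra/PTL setup.

.. code-block:: python

    # Hedged sketch of what +init_from_nemo_model does, done by hand.
    from nemo.collections.asr.models import EncDecCTCModel

    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

    # Restore the donor model and copy its (partially matching) weights.
    pretrained = EncDecCTCModel.restore_from(restore_path="/path/to/pretrained.nemo")
    asr_model.load_state_dict(pretrained.state_dict(), strict=False)
    del pretrained  # free the donor model once its weights are copied
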
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_diarization/api.rst
@@ -12,6 +12,6 @@ Model Classes
Mixins
------

.. autoclass:: nemo.collections.asr.parts.mixins.DiarizationMixin
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.DiarizationMixin
:show-inheritance:
:members:
2 changes: 1 addition & 1 deletion docs/source/asr/speaker_recognition/configs.rst
@@ -80,7 +80,7 @@ minimum and maximum SNR specified with min_snr and max_snr respectively. This se
max_snr_db: 15
See the :class:`nemo.collections.asr.parts.perturb.AudioAugmentor` API section for more details.
See the :class:`nemo.collections.asr.parts.preprocessing.perturb.AudioAugmentor` API section for more details.


Model Architecture Configurations
1 change: 1 addition & 0 deletions docs/source/conf.py
@@ -52,6 +52,7 @@
'nemo_text_processing.inverse_text_normalization', # Not installed automatically
'nemo_text_processing.text_normalization', # Not installed automatically
'attr', # attrdict in requirements, attr in import
'torchmetrics', # inherited from PTL
]

_skipped_autodoc_mock_imports = ['wrapt', 'numpy']
17 changes: 17 additions & 0 deletions examples/asr/speech_to_label.py
@@ -102,20 +102,37 @@
+trainer.precision=16 \
+trainer.amp_level=O1 # needed if using PyTorch < 1.6
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/results.html#
"""
import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecClassificationModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="matchboxnet_3x1x64_v1")
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
45 changes: 30 additions & 15 deletions examples/asr/speech_to_text.py
@@ -12,21 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


"""
# Training the model
Basic run (on CPU for 50 epochs):
python examples/asr/speech_to_text.py \
model.train_ds.manifest_filepath="/Users/okuchaiev/Data/an4_dataset/an4_train.json" \
model.validation_ds.manifest_filepath="/Users/okuchaiev/Data/an4_dataset/an4_val.json" \
hydra.run.dir="." \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="<path to manifest file>" \
model.validation_ds.manifest_filepath="<path to manifest file>" \
trainer.gpus=0 \
trainer.max_epochs=50
@@ -41,19 +34,19 @@
Override some args of optimizer:
python speech_to_text.py \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
hydra.run.dir="." \
trainer.gpus=2 \
trainer.max_epochs=2 \
model.optim.args.betas=[0.8,0.5] \
model.optim.args.weight_decay=0.0001
Overide optimizer entirely
Override optimizer entirely
python speech_to_text.py \
# (Optional: --config-path=<path to dir of configs> --config-name=<name of config without .yaml>) \
model.train_ds.manifest_filepath="./an4/train_manifest.json" \
model.validation_ds.manifest_filepath="./an4/test_manifest.json" \
hydra.run.dir="." \
trainer.gpus=2 \
trainer.max_epochs=2 \
model.optim.name=adamw \
@@ -62,16 +55,38 @@
+model.optim.args.betas=[0.8,0.5]\
+model.optim.args.weight_decay=0.0005
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html
"""

import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecCTCModel
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="config")
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModel(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
18 changes: 16 additions & 2 deletions examples/asr/speech_to_text_bpe.py
@@ -50,7 +50,19 @@
exp_manager.wandb_logger_kwargs.name="<Name of experiment>" \
exp_manager.wandb_logger_kwargs.project="<Name of project>"
```
# Fine-tune a model
For documentation on fine-tuning this model, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations
# Pretrained Models
For documentation on existing pretrained models, please visit -
https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/results.html
"""

import pytorch_lightning as pl
from omegaconf import OmegaConf

@@ -63,12 +75,14 @@
@hydra_runner(config_path="experimental/configs/", config_name="config_bpe")
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    print(OmegaConf.to_yaml(cfg))

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
