From fd2eae5b42d15a9342c11a29321f2a37b64f528a Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 9 Mar 2023 16:34:40 -0800 Subject: [PATCH 1/2] Update exp manager docs and refactor loggers Signed-off-by: smajumdar --- docs/source/conf.py | 6 +- docs/source/core/adapters/api.rst | 2 +- docs/source/core/adapters/components.rst | 4 +- docs/source/core/exp_manager.rst | 118 ++++++++++++++++++++++- nemo/utils/exp_manager.py | 20 +--- nemo/utils/loggers/__init__.py | 1 + nemo/utils/loggers/mlflow_logger.py | 31 ++++++ 7 files changed, 160 insertions(+), 22 deletions(-) create mode 100644 nemo/utils/loggers/mlflow_logger.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 236c4f8a8af6..8ce8e146c2fd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -42,18 +42,20 @@ 'torch.utils.data', 'torch.utils.data.sampler', 'torchtext', + 'torchvision', 'ruamel.yaml', # ruamel.yaml has ., which is troublesome for this regex 'hydra', # hydra-core in requirements, hydra during import 'dateutil', # part of core python 'transformers.tokenization_bert', # has ., troublesome for this regex 'megatron', # megatron-lm in requirements, megatron in import - 'sklearn', + 'sklearn', # scikit_learn in requirements, sklearn in import 'nemo_text_processing.inverse_text_normalization', # Not installed automatically 'nemo_text_processing.text_normalization', # Not installed automatically 'attr', # attrdict in requirements, attr in import 'torchmetrics', # inherited from PTL + 'lightning_utilities', # inherited from PTL 'apex', - 'joblib', + 'joblib', # inherited from optional code 'IPython', 'ipadic', 'psutil', diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index 8eea09f57e78..b0f2a8e13610 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -24,7 +24,7 @@ Adapter Networks ---------------- -.. autoclass:: nemo.collections.common.parts.adapter_modules.AbstractAdapterModule +.. autoclass:: nemo.collections.common.parts.adapter_modules.AdapterModuleUtil :show-inheritance: :members: :member-order: bysource diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index 79ca78801cd6..cc2ea0b525df 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -21,10 +21,10 @@ Adapter modules represent the functional form of the adapter. We discuss an exam .. note:: - All adapter modules must extend :class:`~nemo.collections.common.parts.adapter_modules.AbstractAdapterModule` and should ideally have an equivalent DataClass config for easy instantiation ! + All adapter modules must extend :class:`~nemo.collections.common.parts.adapter_modules.AdapterModuleUtil` and should ideally have an equivalent DataClass config for easy instantiation ! -.. autoclass:: nemo.collections.common.parts.adapter_modules.AbstractAdapterModule +.. autoclass:: nemo.collections.common.parts.adapter_modules.AdapterModuleUtil :show-inheritance: :members: :member-order: bysource diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index d9bf4aa8e3e3..23874e5c8c13 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -11,7 +11,7 @@ To use the experiment manager simply call :class:`~nemo.utils.exp_manager.exp_ma .. code-block:: python - exp_manager(trainer, cfg.get("exp_manager", None)) + exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) And is configurable via YAML with Hydra. 
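For context, a minimal Hydra entry point that wires ``exp_manager`` into a training script might look like the sketch below; the config location and the commented-out model class are illustrative placeholders, not part of this patch:

.. code-block:: python

    import pytorch_lightning as pl
    from omegaconf import DictConfig

    from nemo.core.config import hydra_runner
    from nemo.utils.exp_manager import exp_manager


    @hydra_runner(config_path="conf", config_name="config")  # illustrative config location
    def main(cfg: DictConfig) -> None:
        trainer = pl.Trainer(**cfg.trainer)
        # Attaches loggers/callbacks to the trainer, creates the log dir and returns its path.
        exp_dir = exp_manager(trainer, cfg.get("exp_manager", None))
        # model = MyModel(cfg.model, trainer=trainer)  # hypothetical model class
        # trainer.fit(model)


    if __name__ == '__main__':
        main()

Note that ``exp_manager`` must be called after the ``Trainer`` is constructed, since it attaches loggers and callbacks to it.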
@@ -48,6 +48,9 @@ We can configure the ``ModelCheckpoint`` via YAML or CLI.
     # choose how many total checkpoints to save
     checkpoint_callback_params.save_top_k=5
 
+Resume Training
+---------------
+
 We can auto-resume training as well by configuring the ``exp_manager``. Being able to auto-resume is important when doing
 long training runs that are preemptible or may be shut down before the training procedure has completed. To auto-resume
 training, set the following via YAML or CLI:
@@ -67,6 +70,110 @@ via YAML or CLI:
     exp_manager.version: my_experiment_version
 
 
+Experiment Loggers
+------------------
+
+Alongside TensorBoard, NeMo also supports Weights and Biases, MLFlow, DLLogger and ClearML. To use these loggers, simply set the following
+via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.
+
+
+Weights and Biases (WandB)
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _exp_manager_weights_biases-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_wandb_logger: True
+        wandb_logger_kwargs:
+            name: ${name}
+            project: ${project}
+            entity: ${entity}
+
+
+
+MLFlow
+~~~~~~
+
+.. _exp_manager_mlflow-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_mlflow_logger: True
+        mlflow_logger_kwargs:
+            experiment_name: ${name}
+            tags: null  # optional dict of tags for the run
+            save_dir: './mlruns'
+            prefix: ''
+            artifact_location: null
+            # provide run_id if resuming a previously started run
+            run_id: null
+
+DLLogger
+~~~~~~~~
+
+.. _exp_manager_dllogger-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_dllogger_logger: True
+        dllogger_logger_kwargs:
+            verbose: False
+            stdout: False
+            json_file: "./dllogger.json"
+
+ClearML
+~~~~~~~
+
+.. _exp_manager_clearml-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_clearml_logger: True
+        clearml_logger_kwargs:
+            project: null  # name of the project
+            task: null  # optional name of task
+            connect_pytorch: False
+            model_name: null  # optional name of model
+            tags: null  # should be a list of str
+            log_model: False  # log model to clearml server
+            log_cfg: False  # log config to clearml server
+            log_metrics: False  # log metrics to clearml server
+
+Exponential Moving Average
+--------------------------
+
+.. _exp_manager_ema-label:
+
+NeMo supports using exponential moving average (EMA) for model parameters. This can be useful for improving model generalization
+and stability. To use EMA, simply set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        # use exponential moving average for model parameters
+        ema:
+            enabled: True  # False by default
+            decay: 0.999  # decay rate
+            cpu_offload: False  # If EMA parameters should be offloaded to CPU to save GPU memory
+            every_n_steps: 1  # How often to update EMA weights
+            validate_original_weights: False  # Whether to use the original weights (instead of EMA weights) for validation
+
+
 .. _nemo_multirun-label:
 
 Hydra Multi-Run with NeMo
@@ -232,3 +339,12 @@ A simple solution is to finalize the hydra config before you call ``exp_manager(
     trainer = pl.Trainer(**cfg.trainer)
     exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
     ...
+
+
+ExpManagerConfig
+----------------
+
+.. autoclass:: nemo.utils.exp_manager.ExpManagerConfig
+    :show-inheritance:
+    :members:
+    :member-order: bysource
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index 7793ea7d4d7f..e8ce78cfbe2f 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -47,7 +47,7 @@
 from nemo.utils.exceptions import NeMoBaseException
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.lightning_logger_patch import add_filehandlers_to_pl_logger
-from nemo.utils.loggers import ClearMLLogger, ClearMLParams, DLLogger, DLLoggerParams
+from nemo.utils.loggers import ClearMLLogger, ClearMLParams, DLLogger, DLLoggerParams, MLFlowParams
 from nemo.utils.model_utils import inject_model_parallel_rank, uninject_model_parallel_rank
 
 
@@ -106,21 +106,6 @@ class CallbackParams:
     save_on_train_epoch_end: Optional[bool] = False  # Save after training, not after validation
 
 
-@dataclass
-class MLFlowParams:
-    # name of experiment, if none, defaults to the globally set experiment name
-    experiment_name: Optional[str] = None
-    # no run_name because it's set by version
-    # local or remote tracking seerver. If tracking_uri is not set, it defaults to save_dir
-    tracking_uri: Optional[str] = None
-    tags: Optional[Dict[str, Any]] = None
-    save_dir: Optional[str] = "./mlruns"
-    prefix: str = ""
-    artifact_location: Optional[str] = None
-    # provide run_id if resuming a previously started run
-    run_id: Optional[str] = None
-
-
 @dataclass
 class StepTimingParams:
     reduction: Optional[str] = "mean"
@@ -141,6 +126,9 @@ class EMAParams:
 
 @dataclass
 class ExpManagerConfig:
+    """Experiment Manager config for validation of passed arguments.
+    """
+
     # Log dir creation parameters
     explicit_log_dir: Optional[str] = None
     exp_dir: Optional[str] = None
diff --git a/nemo/utils/loggers/__init__.py b/nemo/utils/loggers/__init__.py
index 4d44161527be..35e7b9baaa72 100644
--- a/nemo/utils/loggers/__init__.py
+++ b/nemo/utils/loggers/__init__.py
@@ -14,3 +14,4 @@
 
 from nemo.utils.loggers.clearml_logger import ClearMLLogger, ClearMLParams
 from nemo.utils.loggers.dllogger import DLLogger, DLLoggerParams
+from nemo.utils.loggers.mlflow_logger import MLFlowParams
diff --git a/nemo/utils/loggers/mlflow_logger.py b/nemo/utils/loggers/mlflow_logger.py
new file mode 100644
index 000000000000..9cb15814b293
--- /dev/null
+++ b/nemo/utils/loggers/mlflow_logger.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class MLFlowParams:
+    # name of experiment; if None, defaults to the globally set experiment name
+    experiment_name: Optional[str] = None
+    # no run_name because it's set by version
+    # local or remote tracking server. If tracking_uri is not set, it defaults to save_dir
+    tracking_uri: Optional[str] = None
+    tags: Optional[Dict[str, Any]] = None
+    save_dir: Optional[str] = "./mlruns"
+    prefix: str = ""
+    artifact_location: Optional[str] = None
+    # provide run_id if resuming a previously started run
+    run_id: Optional[str] = None

From 7aea9710119bc508a764c559f7b7ee4e6164bb61 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar
Date: Thu, 9 Mar 2023 17:04:42 -0800
Subject: [PATCH 2/2] Update exportable docs

Signed-off-by: smajumdar
---
 docs/source/core/api.rst        | 15 +++++++++++
 docs/source/core/export.rst     | 33 +++++++++++++------------
 nemo/core/classes/exportable.py | 44 +++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/docs/source/core/api.rst b/docs/source/core/api.rst
index e81092019e1e..aca0d5bec9c7 100644
--- a/docs/source/core/api.rst
+++ b/docs/source/core/api.rst
@@ -102,3 +102,18 @@ Experiment manager
     :show-inheritance:
     :members:
     :member-order: bysource
+
+.. autoclass:: nemo.utils.exp_manager.ExpManagerConfig
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
+
+Exportable
+----------
+
+.. autoclass:: nemo.core.classes.exportable.Exportable
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst
index 2e3f05633e26..0e598e215dbf 100644
--- a/docs/source/core/export.rst
+++ b/docs/source/core/export.rst
@@ -5,7 +5,7 @@ Exporting Models
 ----------------
 Most of the NeMo models can be exported to ONNX or TorchScript to be deployed for inference in optimized execution environments, such as Riva or Triton Inference Server.
-Export interface is provided by the ``Exportable`` mix-in class. If a model extends ``Exportable``, it can be exported by:
+The export interface is provided by the :class:`~nemo.core.classes.exportable.Exportable` mix-in class. If a model extends :class:`~nemo.core.classes.exportable.Exportable`, it can be exported by:
 
 .. code-block:: Python
 
@@ -15,6 +15,8 @@ Export interface is provided by the ``Exportable`` mix-in class. If a model exte
     ...
     mymodel = MyExportableModel.from_pretrained(model_name="MyModelName")
+    mymodel.eval()
+    mymodel.to('cuda')  # or to('cpu') if you don't have a GPU
 
     # exporting pre-trained model to ONNX file for deployment.
     mymodel.export('mymodel.onnx', [options])
@@ -22,7 +24,7 @@ How to Use Model Export
 -----------------------
 
-The following arguments are for ``Exportable.export()``. In most cases, you should only supply the name of the output file and use all defaults:
+The following arguments are for :meth:`~nemo.core.classes.exportable.Exportable.export`. In most cases, you should only supply the name of the output file and use all defaults:
 
 .. code-block:: Python
 
@@ -30,30 +32,29 @@ The following arguments are for ``Exportable.export()``. In most cases, you shou
     self,
     output: str,
     input_example=None,
-    output_example=None,
     verbose=False,
-    export_params=True,
     do_constant_folding=True,
-    keep_initializers_as_inputs=False,
-    onnx_opset_version: int = 13,
-    try_script: bool = False,
-    set_eval: bool = True,
-    check_trace: bool = False,
-    use_dynamic_axes: bool = True,
+    onnx_opset_version=None,
+    check_trace: Union[bool, List[torch.Tensor]] = False,
     dynamic_axes=None,
     check_tolerance=0.01,
+    export_modules_as_functions=False,
+    keep_initializers_as_inputs=None,
 ):
 
-The ``output``, ``input_example``, ``output_example``, ``verbose``, ``export_params``, ``do_constant_folding``, ``keep_initializers_as_inputs``, ``onnx_opset_version``, ``set_eval`` options have the same semantics as in Pytorch ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about Pytorch's``onnx.export()``, refer to the `torch.onnx functions documentation
-<https://pytorch.org/docs/stable/onnx.html#functions>`_.
+The ``output``, ``input_example``, ``verbose``, ``do_constant_folding``, and ``onnx_opset_version`` options have the same semantics as in the PyTorch ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about PyTorch's ``onnx.export()``, refer to the `torch.onnx functions documentation
+<https://pytorch.org/docs/stable/onnx.html#functions>`_. Note that if ``input_example`` is None, ``Exportable.input_example()`` is called.
 
-The file extension of the ``output`` parameter determines export format: ``.onnx->ONNX``, ``.pt`` or ``.ts`` -> ``TorchScript``. If ``input_example`` is None, ``Exportable.input_example()`` is called.
+The file extension of the ``output`` parameter determines the export format:
+
+* ``.onnx`` -> ONNX
+* ``.pt`` or ``.ts`` -> TorchScript
+
+**TorchScript-specific**: By default, the module undergoes ``jit.trace()``. You may need to explicitly wrap some submodules in ``jit.script()`` so that they are traced correctly. The ``check_trace`` arg is passed through to ``jit.trace()``.
 
-**TorchScript-specific**: If ``try_script`` is ``True``, ``export()`` tries ``jit.script()`` before ``jit.trace()``.
-The ``check_trace`` arg is passed through to ``jit.trace()``.
 **ONNX-specific**: ``onnx.export()`` is called with dynamic axes by default. If ``dynamic_axes`` is ``None``, they are inferred from the model's ``input_types`` definition (the batch dimension is dynamic, and so is duration, etc.).
-If ``check_trace`` is ``True``, the resulting ONNX also runs on ``input_example`` and the results compared to ``output_example`` using the ``check_tolerance`` argument. Note the higher tolerance default.
+If ``check_trace`` is ``True``, the exported ONNX model is also run on ``input_example`` and the results are compared to the exported model's output using the ``check_tolerance`` argument. Note the relatively high default tolerance.
 
 
 How to Make Model Exportable
diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py
index 76de3edec13f..5c6c3e12a7eb 100644
--- a/nemo/core/classes/exportable.py
+++ b/nemo/core/classes/exportable.py
@@ -38,6 +38,13 @@ class Exportable(ABC):
     """
     This Interface should be implemented by particular classes derived from nemo.core.NeuralModule or nemo.core.ModelPT.
     It gives these entities ability to be exported for deployment to formats such as ONNX.
+
+    Usage:
+        # exporting pre-trained model to ONNX file for deployment.
+        model.eval()
+        model.to('cuda')  # or to('cpu') if you don't have a GPU
+
+        model.export('mymodel.onnx', [options])  # all arguments apart from `output` are optional.
""" @property @@ -61,6 +68,43 @@ def export( export_modules_as_functions=False, keep_initializers_as_inputs=None, ): + """ + Exports the model to the specified format. The format is inferred from the file extension of the output file. + + Args: + output (str): Output file name. File extension be .onnx, .pt, or .ts, and is used to select export + path of the model. + input_example (list or dict): Example input to the model's forward function. This is used to + trace the model and export it to ONNX/TorchScript. If the model takes multiple inputs, then input_example + should be a list of input examples. If the model takes named inputs, then input_example + should be a dictionary of input examples. + verbose (bool): If True, will print out a detailed description of the model's export steps, along with + the internal trace logs of the export process. + do_constant_folding (bool): If True, will execute constant folding optimization on the model's graph + before exporting. This is ONNX specific. + onnx_opset_version (int): The ONNX opset version to export the model to. If None, will use a reasonable + default version. + check_trace (bool): If True, will verify that the model's output matches the output of the traced + model, upto some tolerance. + dynamic_axes (dict): A dictionary mapping input and output names to their dynamic axes. This is + used to specify the dynamic axes of the model's inputs and outputs. If the model takes multiple inputs, + then dynamic_axes should be a list of dictionaries. If the model takes named inputs, then dynamic_axes + should be a dictionary of dictionaries. If None, will use the dynamic axes of the input_example + derived from the NeuralType of the input and output of the model. + check_tolerance (float): The tolerance to use when checking the model's output against the traced + model's output. This is only used if check_trace is True. Note the high tolerance is used because + the traced model is not guaranteed to be 100% accurate. + export_modules_as_functions (bool): If True, will export the model's submodules as functions. This is + ONNX specific. + keep_initializers_as_inputs (bool): If True, will keep the model's initializers as inputs in the onnx graph. + This is ONNX specific. + + Returns: + A tuple of two outputs. + Item 0 in the output is a list of outputs, the outputs of each subnet exported. + Item 1 in the output is a list of string descriptions. The description of each subnet exported can be + used for logging purposes. + """ all_out = [] all_descr = [] for subnet_name in self.list_export_subnets():