From fd2eae5b42d15a9342c11a29321f2a37b64f528a Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Thu, 9 Mar 2023 16:34:40 -0800 Subject: [PATCH 1/2] Update exp manager docs and refactor loggers Signed-off-by: smajumdar --- docs/source/conf.py | 6 +- docs/source/core/adapters/api.rst | 2 +- docs/source/core/adapters/components.rst | 4 +- docs/source/core/exp_manager.rst | 118 ++++++++++++++++++++++- nemo/utils/exp_manager.py | 20 +--- nemo/utils/loggers/__init__.py | 1 + nemo/utils/loggers/mlflow_logger.py | 31 ++++++ 7 files changed, 160 insertions(+), 22 deletions(-) create mode 100644 nemo/utils/loggers/mlflow_logger.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 236c4f8a8af6..8ce8e146c2fd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -42,18 +42,20 @@ 'torch.utils.data', 'torch.utils.data.sampler', 'torchtext', + 'torchvision', 'ruamel.yaml', # ruamel.yaml has ., which is troublesome for this regex 'hydra', # hydra-core in requirements, hydra during import 'dateutil', # part of core python 'transformers.tokenization_bert', # has ., troublesome for this regex 'megatron', # megatron-lm in requirements, megatron in import - 'sklearn', + 'sklearn', # scikit_learn in requirements, sklearn in import 'nemo_text_processing.inverse_text_normalization', # Not installed automatically 'nemo_text_processing.text_normalization', # Not installed automatically 'attr', # attrdict in requirements, attr in import 'torchmetrics', # inherited from PTL + 'lightning_utilities', # inherited from PTL 'apex', - 'joblib', + 'joblib', # inherited from optional code 'IPython', 'ipadic', 'psutil', diff --git a/docs/source/core/adapters/api.rst b/docs/source/core/adapters/api.rst index 8eea09f57e78..b0f2a8e13610 100644 --- a/docs/source/core/adapters/api.rst +++ b/docs/source/core/adapters/api.rst @@ -24,7 +24,7 @@ Adapter Networks ---------------- -.. autoclass:: nemo.collections.common.parts.adapter_modules.AbstractAdapterModule +.. autoclass:: nemo.collections.common.parts.adapter_modules.AdapterModuleUtil :show-inheritance: :members: :member-order: bysource diff --git a/docs/source/core/adapters/components.rst b/docs/source/core/adapters/components.rst index 79ca78801cd6..cc2ea0b525df 100644 --- a/docs/source/core/adapters/components.rst +++ b/docs/source/core/adapters/components.rst @@ -21,10 +21,10 @@ Adapter modules represent the functional form of the adapter. We discuss an exam .. note:: - All adapter modules must extend :class:`~nemo.collections.common.parts.adapter_modules.AbstractAdapterModule` and should ideally have an equivalent DataClass config for easy instantiation ! + All adapter modules must extend :class:`~nemo.collections.common.parts.adapter_modules.AdapterModuleUtil` and should ideally have an equivalent DataClass config for easy instantiation ! -.. autoclass:: nemo.collections.common.parts.adapter_modules.AbstractAdapterModule +.. autoclass:: nemo.collections.common.parts.adapter_modules.AdapterModuleUtil :show-inheritance: :members: :member-order: bysource diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst index d9bf4aa8e3e3..23874e5c8c13 100644 --- a/docs/source/core/exp_manager.rst +++ b/docs/source/core/exp_manager.rst @@ -11,7 +11,7 @@ To use the experiment manager simply call :class:`~nemo.utils.exp_manager.exp_ma .. code-block:: python - exp_manager(trainer, cfg.get("exp_manager", None)) + exp_dir = exp_manager(trainer, cfg.get("exp_manager", None)) And is configurable via YAML with Hydra. 
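For context, a minimal Hydra entry point that wires ``exp_manager`` into a training script might look like the sketch below; the config location and the commented-out model class are illustrative placeholders, not part of this patch:

.. code-block:: python

    import pytorch_lightning as pl
    from omegaconf import DictConfig

    from nemo.core.config import hydra_runner
    from nemo.utils.exp_manager import exp_manager


    @hydra_runner(config_path="conf", config_name="config")  # illustrative config location
    def main(cfg: DictConfig) -> None:
        trainer = pl.Trainer(**cfg.trainer)
        # Attaches loggers/callbacks to the trainer, creates the log dir and returns its path.
        exp_dir = exp_manager(trainer, cfg.get("exp_manager", None))
        # model = MyModel(cfg.model, trainer=trainer)  # hypothetical model class
        # trainer.fit(model)


    if __name__ == '__main__':
        main()

Note that ``exp_manager`` must be called after the ``Trainer`` is constructed, since it attaches loggers and callbacks to it.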
@@ -48,6 +48,9 @@ We can configure the ``ModelCheckpoint`` via YAML or CLI.
     # choose how many total checkpoints to save
     checkpoint_callback_params.save_top_k=5
 
+Resume Training
+---------------
+
 We can auto-resume training as well by configuring the ``exp_manager``. Being able to auto-resume is important when doing
 long training runs that are preemptible or may be shut down before the training procedure has completed. To auto-resume
 training, set the following via YAML or CLI:
@@ -67,6 +70,110 @@ via YAML or CLI:
     exp_manager.version: my_experiment_version
 
 
+Experiment Loggers
+------------------
+
+Alongside TensorBoard, NeMo also supports Weights and Biases, MLFlow, DLLogger and ClearML. To use these loggers, simply set the following
+via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.
+
+
+Weights and Biases (WandB)
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _exp_manager_weights_biases-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_wandb_logger: True
+        wandb_logger_kwargs:
+            name: ${name}
+            project: ${project}
+            entity: ${entity}
+
+
+
+MLFlow
+~~~~~~
+
+.. _exp_manager_mlflow-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_mlflow_logger: True
+        mlflow_logger_kwargs:
+            experiment_name: ${name}
+            tags: null  # optional dict of tags for the run
+            save_dir: './mlruns'
+            prefix: ''
+            artifact_location: null
+            # provide run_id if resuming a previously started run
+            run_id: null
+
+DLLogger
+~~~~~~~~
+
+.. _exp_manager_dllogger-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_dllogger_logger: True
+        dllogger_logger_kwargs:
+            verbose: False
+            stdout: False
+            json_file: "./dllogger.json"
+
+ClearML
+~~~~~~~
+
+.. _exp_manager_clearml-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_clearml_logger: True
+        clearml_logger_kwargs:
+            project: null  # name of the project
+            task: null  # optional name of task
+            connect_pytorch: False
+            model_name: null  # optional name of model
+            tags: null  # should be a list of str
+            log_model: False  # log model to clearml server
+            log_cfg: False  # log config to clearml server
+            log_metrics: False  # log metrics to clearml server
+
+Exponential Moving Average
+--------------------------
+
+.. _exp_manager_ema-label:
+
+NeMo supports using exponential moving average (EMA) for model parameters. This can be useful for improving model generalization
+and stability. To use EMA, simply set the following via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        # use exponential moving average for model parameters
+        ema:
+            enabled: True  # False by default
+            decay: 0.999  # decay rate
+            cpu_offload: False  # If EMA parameters should be offloaded to CPU to save GPU memory
+            every_n_steps: 1  # How often to update EMA weights
+            validate_original_weights: False  # Whether to use the original weights (instead of EMA weights) for validation
+
+
 .. _nemo_multirun-label:
 
 Hydra Multi-Run with NeMo
@@ -232,3 +339,12 @@ A simple solution is to finalize the hydra config before you call ``exp_manager(
     trainer = pl.Trainer(**cfg.trainer)
     exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
     ...
+
+
+ExpManagerConfig
+----------------
+
+.. autoclass:: nemo.utils.exp_manager.ExpManagerConfig
+    :show-inheritance:
+    :members:
+    :member-order: bysource
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index 7793ea7d4d7f..e8ce78cfbe2f 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -47,7 +47,7 @@
 from nemo.utils.exceptions import NeMoBaseException
 from nemo.utils.get_rank import is_global_rank_zero
 from nemo.utils.lightning_logger_patch import add_filehandlers_to_pl_logger
-from nemo.utils.loggers import ClearMLLogger, ClearMLParams, DLLogger, DLLoggerParams
+from nemo.utils.loggers import ClearMLLogger, ClearMLParams, DLLogger, DLLoggerParams, MLFlowParams
 from nemo.utils.model_utils import inject_model_parallel_rank, uninject_model_parallel_rank
 
 
@@ -106,21 +106,6 @@ class CallbackParams:
     save_on_train_epoch_end: Optional[bool] = False  # Save after training, not after validation
 
 
-@dataclass
-class MLFlowParams:
-    # name of experiment, if none, defaults to the globally set experiment name
-    experiment_name: Optional[str] = None
-    # no run_name because it's set by version
-    # local or remote tracking seerver. If tracking_uri is not set, it defaults to save_dir
-    tracking_uri: Optional[str] = None
-    tags: Optional[Dict[str, Any]] = None
-    save_dir: Optional[str] = "./mlruns"
-    prefix: str = ""
-    artifact_location: Optional[str] = None
-    # provide run_id if resuming a previously started run
-    run_id: Optional[str] = None
-
-
 @dataclass
 class StepTimingParams:
     reduction: Optional[str] = "mean"
@@ -141,6 +126,9 @@ class EMAParams:
 
 @dataclass
 class ExpManagerConfig:
+    """Experiment Manager config for validation of passed arguments.
+    """
+
     # Log dir creation parameters
     explicit_log_dir: Optional[str] = None
     exp_dir: Optional[str] = None
diff --git a/nemo/utils/loggers/__init__.py b/nemo/utils/loggers/__init__.py
index 4d44161527be..35e7b9baaa72 100644
--- a/nemo/utils/loggers/__init__.py
+++ b/nemo/utils/loggers/__init__.py
@@ -14,3 +14,4 @@
 
 from nemo.utils.loggers.clearml_logger import ClearMLLogger, ClearMLParams
 from nemo.utils.loggers.dllogger import DLLogger, DLLoggerParams
+from nemo.utils.loggers.mlflow_logger import MLFlowParams
diff --git a/nemo/utils/loggers/mlflow_logger.py b/nemo/utils/loggers/mlflow_logger.py
new file mode 100644
index 000000000000..9cb15814b293
--- /dev/null
+++ b/nemo/utils/loggers/mlflow_logger.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class MLFlowParams:
+    # name of experiment; if None, defaults to the globally set experiment name
+    experiment_name: Optional[str] = None
+    # no run_name because it's set by version
+    # local or remote tracking server. If tracking_uri is not set, it defaults to save_dir
+    tracking_uri: Optional[str] = None
+    tags: Optional[Dict[str, Any]] = None
+    save_dir: Optional[str] = "./mlruns"
+    prefix: str = ""
+    artifact_location: Optional[str] = None
+    # provide run_id if resuming a previously started run
+    run_id: Optional[str] = None

From 7aea9710119bc508a764c559f7b7ee4e6164bb61 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar
Date: Thu, 9 Mar 2023 17:04:42 -0800
Subject: [PATCH 2/2] Update exportable docs

Signed-off-by: smajumdar
---
 docs/source/core/api.rst        | 15 +++++++++++
 docs/source/core/export.rst     | 33 +++++++++++++------------
 nemo/core/classes/exportable.py | 44 +++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/docs/source/core/api.rst b/docs/source/core/api.rst
index e81092019e1e..aca0d5bec9c7 100644
--- a/docs/source/core/api.rst
+++ b/docs/source/core/api.rst
@@ -102,3 +102,18 @@ Experiment manager
     :show-inheritance:
     :members:
     :member-order: bysource
+
+.. autoclass:: nemo.utils.exp_manager.ExpManagerConfig
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
+
+Exportable
+----------
+
+.. autoclass:: nemo.core.classes.exportable.Exportable
+    :show-inheritance:
+    :members:
+    :member-order: bysource
+
diff --git a/docs/source/core/export.rst b/docs/source/core/export.rst
index 2e3f05633e26..0e598e215dbf 100644
--- a/docs/source/core/export.rst
+++ b/docs/source/core/export.rst
@@ -5,7 +5,7 @@ Exporting Models
 ----------------
 Most of the NeMo models can be exported to ONNX or TorchScript to be deployed for inference in optimized execution environments, such as Riva or Triton Inference Server.
-Export interface is provided by the ``Exportable`` mix-in class. If a model extends ``Exportable``, it can be exported by:
+The export interface is provided by the :class:`~nemo.core.classes.exportable.Exportable` mix-in class. If a model extends :class:`~nemo.core.classes.exportable.Exportable`, it can be exported by:
 
 .. code-block:: Python
 
@@ -15,6 +15,8 @@ Export interface is provided by the ``Exportable`` mix-in class. If a model exte
     ...
     mymodel = MyExportableModel.from_pretrained(model_name="MyModelName")
+    mymodel.eval()
+    mymodel.to('cuda')  # or to('cpu') if you don't have a GPU
 
     # exporting pre-trained model to ONNX file for deployment.
     mymodel.export('mymodel.onnx', [options])
@@ -22,7 +24,7 @@ How to Use Model Export
 -----------------------
 
-The following arguments are for ``Exportable.export()``. In most cases, you should only supply the name of the output file and use all defaults:
+The following arguments are for :meth:`~nemo.core.classes.exportable.Exportable.export`. In most cases, you should only supply the name of the output file and use all defaults:
 
 .. code-block:: Python
 
@@ -30,30 +32,29 @@ The following arguments are for ``Exportable.export()``. In most cases, you shou
     self,
     output: str,
     input_example=None,
-    output_example=None,
     verbose=False,
-    export_params=True,
     do_constant_folding=True,
-    keep_initializers_as_inputs=False,
-    onnx_opset_version: int = 13,
-    try_script: bool = False,
-    set_eval: bool = True,
-    check_trace: bool = False,
-    use_dynamic_axes: bool = True,
+    onnx_opset_version=None,
+    check_trace: Union[bool, List[torch.Tensor]] = False,
     dynamic_axes=None,
     check_tolerance=0.01,
+    export_modules_as_functions=False,
+    keep_initializers_as_inputs=None,
 ):
 
-The ``output``, ``input_example``, ``output_example``, ``verbose``, ``export_params``, ``do_constant_folding``, ``keep_initializers_as_inputs``, ``onnx_opset_version``, ``set_eval`` options have the same semantics as in Pytorch ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about Pytorch's``onnx.export()``, refer to the `torch.onnx functions documentation
-<https://pytorch.org/docs/stable/onnx.html#functions>`_.
+The ``output``, ``input_example``, ``verbose``, ``do_constant_folding``, and ``onnx_opset_version`` options have the same semantics as in the PyTorch ``onnx.export()`` and ``jit.trace()`` functions and are passed through. For more information about PyTorch's ``onnx.export()``, refer to the `torch.onnx functions documentation
+<https://pytorch.org/docs/stable/onnx.html#functions>`_. Note that if ``input_example`` is None, ``Exportable.input_example()`` is called.
 
-The file extension of the ``output`` parameter determines export format: ``.onnx->ONNX``, ``.pt`` or ``.ts`` -> ``TorchScript``. If ``input_example`` is None, ``Exportable.input_example()`` is called.
+The file extension of the ``output`` parameter determines the export format:
+
+* ``.onnx`` -> ONNX
+* ``.pt`` or ``.ts`` -> TorchScript
+
+**TorchScript-specific**: By default, the module undergoes ``jit.trace()``. You may need to explicitly wrap some submodules in ``jit.script()`` so that they are traced correctly. The ``check_trace`` arg is passed through to ``jit.trace()``.
 
-**TorchScript-specific**: If ``try_script`` is ``True``, ``export()`` tries ``jit.script()`` before ``jit.trace()``.
-The ``check_trace`` arg is passed through to ``jit.trace()``.
 **ONNX-specific**: ``onnx.export()`` is called with dynamic axes by default. If ``dynamic_axes`` is ``None``, they are inferred from the model's ``input_types`` definition (the batch dimension is dynamic, and so is duration, etc.).
-If ``check_trace`` is ``True``, the resulting ONNX also runs on ``input_example`` and the results compared to ``output_example`` using the ``check_tolerance`` argument. Note the higher tolerance default.
+If ``check_trace`` is ``True``, the exported ONNX model is also run on ``input_example`` and the results are compared to the exported model's output using the ``check_tolerance`` argument. Note the relatively high default tolerance.
 
 
 How to Make Model Exportable
diff --git a/nemo/core/classes/exportable.py b/nemo/core/classes/exportable.py
index 76de3edec13f..5c6c3e12a7eb 100644
--- a/nemo/core/classes/exportable.py
+++ b/nemo/core/classes/exportable.py
@@ -38,6 +38,13 @@ class Exportable(ABC):
     """
     This Interface should be implemented by particular classes derived from nemo.core.NeuralModule or nemo.core.ModelPT.
     It gives these entities ability to be exported for deployment to formats such as ONNX.
+
+    Usage:
+        # exporting pre-trained model to ONNX file for deployment.
+        model.eval()
+        model.to('cuda')  # or to('cpu') if you don't have a GPU
+
+        model.export('mymodel.onnx', [options])  # all arguments apart from `output` are optional.
""" @property @@ -61,6 +68,43 @@ def export( export_modules_as_functions=False, keep_initializers_as_inputs=None, ): + """ + Exports the model to the specified format. The format is inferred from the file extension of the output file. + + Args: + output (str): Output file name. File extension be .onnx, .pt, or .ts, and is used to select export + path of the model. + input_example (list or dict): Example input to the model's forward function. This is used to + trace the model and export it to ONNX/TorchScript. If the model takes multiple inputs, then input_example + should be a list of input examples. If the model takes named inputs, then input_example + should be a dictionary of input examples. + verbose (bool): If True, will print out a detailed description of the model's export steps, along with + the internal trace logs of the export process. + do_constant_folding (bool): If True, will execute constant folding optimization on the model's graph + before exporting. This is ONNX specific. + onnx_opset_version (int): The ONNX opset version to export the model to. If None, will use a reasonable + default version. + check_trace (bool): If True, will verify that the model's output matches the output of the traced + model, upto some tolerance. + dynamic_axes (dict): A dictionary mapping input and output names to their dynamic axes. This is + used to specify the dynamic axes of the model's inputs and outputs. If the model takes multiple inputs, + then dynamic_axes should be a list of dictionaries. If the model takes named inputs, then dynamic_axes + should be a dictionary of dictionaries. If None, will use the dynamic axes of the input_example + derived from the NeuralType of the input and output of the model. + check_tolerance (float): The tolerance to use when checking the model's output against the traced + model's output. This is only used if check_trace is True. Note the high tolerance is used because + the traced model is not guaranteed to be 100% accurate. + export_modules_as_functions (bool): If True, will export the model's submodules as functions. This is + ONNX specific. + keep_initializers_as_inputs (bool): If True, will keep the model's initializers as inputs in the onnx graph. + This is ONNX specific. + + Returns: + A tuple of two outputs. + Item 0 in the output is a list of outputs, the outputs of each subnet exported. + Item 1 in the output is a list of string descriptions. The description of each subnet exported can be + used for logging purposes. + """ all_out = [] all_descr = [] for subnet_name in self.list_export_subnets():