From d1908636217ff324f5f881f0221a955e261108d9 Mon Sep 17 00:00:00 2001
From: Harishankar G
Date: Tue, 23 Jan 2024 14:22:46 +0530
Subject: [PATCH] Added support for neptune logger (#8210)

* Added support for neptune logger

Signed-off-by: Harishankar G

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fixed config indentation in example config

Signed-off-by: Harishankar G

---------

Signed-off-by: Harishankar G
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Pablo Garay
---
 docs/source/core/exp_manager.rst   | 22 +++++++++++++-
 .../conf/megatron_gpt_config.yaml  |  8 +++++
 nemo/utils/exp_manager.py          | 23 +++++++++++++-
 tests/core/test_exp_manager.py     | 30 +++++++++++++++++++
 4 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/docs/source/core/exp_manager.rst b/docs/source/core/exp_manager.rst
index 6ef47b24b32f..b44d27c38b4b 100644
--- a/docs/source/core/exp_manager.rst
+++ b/docs/source/core/exp_manager.rst
@@ -73,7 +73,7 @@ via YAML or CLI:
 Experiment Loggers
 ------------------
 
-Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow and DLLogger. To use these loggers, simply set the following
+Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, simply set the following
 via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.
 
@@ -153,6 +153,26 @@ ClearML
             log_cfg: False # log config to clearml server
             log_metrics: False # log metrics to clearml server
 
+Neptune
+~~~~~~~
+
+.. _exp_manager_neptune-label:
+
+.. code-block:: yaml
+
+    exp_manager:
+        ...
+        create_checkpoint_callback: True
+        create_neptune_logger: false
+        neptune_logger_kwargs:
+            project: ${project}
+            name: ${name}
+            prefix: train
+            log_model_checkpoints: false # set to True if checkpoints need to be pushed to Neptune
+            tags: null # can specify as an array of strings in yaml array format
+            description: null
+
+
 Exponential Moving Average
 --------------------------
 
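For context, the YAML keys documented above map directly onto an `exp_manager` call. The snippet below is a minimal usage sketch and is not part of the patch; it assumes the `neptune` client is installed and `NEPTUNE_API_TOKEN` is exported, and the project/name values are placeholders.

```python
# Illustrative sketch (not part of the patch): enable the Neptune logger through exp_manager.
# Assumes `pip install neptune` and a NEPTUNE_API_TOKEN environment variable (or an "api_key"
# entry inside neptune_logger_kwargs). The project and name values are placeholders.
import pytorch_lightning as pl
from nemo.utils.exp_manager import exp_manager

trainer = pl.Trainer(accelerator="cpu", logger=False)
exp_manager(
    trainer,
    {
        "create_tensorboard_logger": False,
        "create_checkpoint_callback": False,
        "create_neptune_logger": True,
        "neptune_logger_kwargs": {
            "project": "my-workspace/my-project",  # placeholder
            "name": "my-run",                      # placeholder
            "prefix": "train",
            "log_model_checkpoints": False,
        },
    },
)
```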
diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
index 32cab48a68c8..9bd50e14806d 100755
--- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -32,6 +32,14 @@ exp_manager:
   wandb_logger_kwargs:
     project: null
     name: null
+  create_neptune_logger: false
+  neptune_logger_kwargs:
+    project: null
+    name: null
+    prefix: train
+    log_model_checkpoints: false
+    tags: null # can specify as an array of strings in yaml array format
+    description: null
   resume_if_exists: True
   resume_ignore_no_checkpoint: True
   resume_from_checkpoint: ${model.resume_from_checkpoint}
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index ba00b5bbbc8f..4bde204f2976 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -32,7 +32,7 @@
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 from pytorch_lightning.callbacks.timer import Interval, Timer
-from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger
+from pytorch_lightning.loggers import MLFlowLogger, NeptuneLogger, TensorBoardLogger, WandbLogger
 from pytorch_lightning.loops import _TrainingEpochLoop
 from pytorch_lightning.strategies.ddp import DDPStrategy
 
@@ -152,6 +152,8 @@ class ExpManagerConfig:
     dllogger_logger_kwargs: Optional[DLLoggerParams] = field(default_factory=lambda: DLLoggerParams())
     create_clearml_logger: Optional[bool] = False
     clearml_logger_kwargs: Optional[ClearMLParams] = field(default_factory=lambda: ClearMLParams())
+    create_neptune_logger: Optional[bool] = False
+    neptune_logger_kwargs: Optional[Dict[Any, Any]] = None
     # Checkpointing parameters
     create_checkpoint_callback: Optional[bool] = True
     checkpoint_callback_params: Optional[CallbackParams] = field(default_factory=lambda: CallbackParams())
@@ -422,6 +424,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
         or cfg.create_mlflow_logger
         or cfg.create_dllogger_logger
         or cfg.create_clearml_logger
+        or cfg.create_neptune_logger
     ):
         configure_loggers(
             trainer,
@@ -440,6 +443,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
             cfg.dllogger_logger_kwargs,
             cfg.create_clearml_logger,
             cfg.clearml_logger_kwargs,
+            cfg.create_neptune_logger,
+            cfg.neptune_logger_kwargs,
         )
 
     # add loggers timing callbacks
@@ -815,6 +820,8 @@ def configure_loggers(
     dllogger_kwargs: dict,
     create_clearml_logger: bool,
     clearml_kwargs: dict,
+    create_neptune_logger: bool,
+    neptune_kwargs: dict,
 ):
     """
     Creates TensorboardLogger and/or WandBLogger / MLFlowLogger / DLlogger / ClearMLLogger and attach them to trainer.
@@ -872,6 +879,20 @@ def configure_loggers(
         logger_list.append(clearml_logger)
         logging.info("ClearMLLogger has been set up")
 
+    if create_neptune_logger:
+        if neptune_kwargs is None:
+            neptune_kwargs = {}
+        if "name" not in neptune_kwargs and "project" not in neptune_kwargs:
+            raise ValueError("name and project are required for neptune_logger")
+        if "api_key" not in neptune_kwargs and not os.getenv("NEPTUNE_API_TOKEN", None):
+            raise ValueError(
+                "either api_key should be set in neptune_kwargs or NEPTUNE_API_TOKEN should be set as an environment variable for neptune_logger"
+            )
+        neptune_logger = NeptuneLogger(**neptune_kwargs)
+
+        logger_list.append(neptune_logger)
+        logging.info("NeptuneLogger has been set up")
+
     trainer._logger_connector.configure_logger(logger_list)
 
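For reference, `configure_loggers` forwards `neptune_logger_kwargs` unchanged to PyTorch Lightning's `NeptuneLogger`, so the accepted keys are whatever that logger accepts. A short illustrative sketch (values are placeholders, not taken from the patch):

```python
# Illustration only: the patch ultimately calls NeptuneLogger(**neptune_kwargs), so the dict
# configured under neptune_logger_kwargs is passed straight through as keyword arguments.
# A valid NEPTUNE_API_TOKEN (or an api_key entry) is still needed before a run can be created.
from pytorch_lightning.loggers import NeptuneLogger

neptune_kwargs = {
    "project": "my-workspace/my-project",  # placeholder
    "name": "my-run",                      # placeholder
    "prefix": "train",
    "log_model_checkpoints": False,
}
neptune_logger = NeptuneLogger(**neptune_kwargs)  # same construction as in configure_loggers
```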
diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py
index d69dc7804e32..7a8eec669d40 100644
--- a/tests/core/test_exp_manager.py
+++ b/tests/core/test_exp_manager.py
@@ -195,6 +195,36 @@ def test_trainer_loggers(self, tmp_path):
         )
         assert isinstance(test_trainer.logger, pl.loggers.WandbLogger)
 
+    @pytest.mark.unit
+    def test_trainer_neptune_logger(self, tmp_path):
+        pytest.importorskip("neptune", reason="could not import `neptune`, use `pip install neptune` to run this test")
+
+        test_trainer = pl.Trainer(accelerator='cpu', logger=False)
+        # Check that create_neptune_logger=True errors out unless neptune_logger_kwargs is passed.
+        with pytest.raises(ValueError):
+            _ = exp_manager(
+                test_trainer,
+                {
+                    "create_tensorboard_logger": False,
+                    "create_checkpoint_callback": False,
+                    "exp_dir": str(tmp_path),
+                    "create_neptune_logger": True,
+                },
+            )
+        # Check that a NeptuneLogger is attached to logger if create_neptune_logger=True and neptune_logger_kwargs has name
+        # and project
+        _ = exp_manager(
+            test_trainer,
+            {
+                "create_tensorboard_logger": False,
+                "create_checkpoint_callback": False,
+                "exp_dir": str(tmp_path),
+                "create_neptune_logger": True,
+                "neptune_logger_kwargs": {"name": "", "project": "", "api_key": ""},
+            },
+        )
+        assert isinstance(test_trainer.logger, pl.loggers.NeptuneLogger)
+
     @pytest.mark.unit
     def test_checkpoint_configurations(self):
         """ Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but