Added support for neptune logger (#8210)
* Added support for neptune logger

Signed-off-by: Harishankar G <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fixed config indentation in example config

Signed-off-by: Harishankar G <[email protected]>

---------

Signed-off-by: Harishankar G <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: Pablo Garay <[email protected]>
2 people authored and pablo-garay committed Mar 19, 2024
1 parent cb73f3a commit d190863
Showing 4 changed files with 81 additions and 2 deletions.
22 changes: 21 additions & 1 deletion docs/source/core/exp_manager.rst
@@ -73,7 +73,7 @@ via YAML or CLI:
Experiment Loggers
------------------

Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow and DLLogger. To use these loggers, simply set the following
Alongside Tensorboard, NeMo also supports Weights and Biases, MLFlow, DLLogger, ClearML and NeptuneLogger. To use these loggers, simply set the following
via YAML or :class:`~nemo.utils.exp_manager.ExpManagerConfig`.


@@ -153,6 +153,26 @@ ClearML
            log_cfg: False # log config to clearml server
            log_metrics: False # log metrics to clearml server

Neptune
~~~~~~~

.. _exp_manager_neptune-label:

.. code-block:: yaml

    exp_manager:
        ...
        create_checkpoint_callback: True
        create_neptune_logger: false
        neptune_logger_kwargs:
            project: ${project}
            name: ${name}
            prefix: train
            log_model_checkpoints: false # set to True if checkpoints need to be pushed to Neptune
            tags: null # can specify as an array of strings in yaml array format
            description: null
            <Add any other arguments supported by Neptune logger here>
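For reference, here is a minimal Python sketch (not part of this change) of enabling the same logger by passing a plain dict to ``exp_manager``, mirroring the keys above; the ``project`` and ``name`` values are placeholders, and a valid ``NEPTUNE_API_TOKEN`` is assumed to be exported in the environment (otherwise pass ``api_key`` inside ``neptune_logger_kwargs``):

.. code-block:: python

    import pytorch_lightning as pl
    from nemo.utils.exp_manager import exp_manager

    trainer = pl.Trainer(accelerator="cpu", logger=False)
    exp_manager(
        trainer,
        {
            "create_checkpoint_callback": True,
            "create_neptune_logger": True,
            "neptune_logger_kwargs": {
                "project": "my-workspace/my-project",  # placeholder
                "name": "my-run",  # placeholder
                "prefix": "train",
                "log_model_checkpoints": False,
            },
        },
    )
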
Exponential Moving Average
--------------------------

8 changes: 8 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -32,6 +32,14 @@ exp_manager:
  wandb_logger_kwargs:
    project: null
    name: null
  create_neptune_logger: false
  neptune_logger_kwargs:
    project: null
    name: null
    prefix: train
    log_model_checkpoints: false
    tags: null # can specify as an array of strings in yaml array format
    description: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  resume_from_checkpoint: ${model.resume_from_checkpoint}
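A hedged sketch (not in the diff) of toggling the new keys programmatically by loading and overriding this config with OmegaConf; the path assumes the NeMo repository root as the working directory, and the project/name values are placeholders:

from omegaconf import OmegaConf

# Load the example config and flip the new Neptune switches in place.
cfg = OmegaConf.load("examples/nlp/language_modeling/conf/megatron_gpt_config.yaml")
cfg.exp_manager.create_neptune_logger = True
cfg.exp_manager.neptune_logger_kwargs.project = "my-workspace/megatron-gpt"  # placeholder
cfg.exp_manager.neptune_logger_kwargs.name = "gpt-pretrain-demo"  # placeholder
print(OmegaConf.to_yaml(cfg.exp_manager.neptune_logger_kwargs))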
23 changes: 22 additions & 1 deletion nemo/utils/exp_manager.py
@@ -32,7 +32,7 @@
from pytorch_lightning.callbacks import Callback, ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.timer import Interval, Timer
from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger
from pytorch_lightning.loggers import MLFlowLogger, NeptuneLogger, TensorBoardLogger, WandbLogger
from pytorch_lightning.loops import _TrainingEpochLoop
from pytorch_lightning.strategies.ddp import DDPStrategy

@@ -152,6 +152,8 @@ class ExpManagerConfig:
    dllogger_logger_kwargs: Optional[DLLoggerParams] = field(default_factory=lambda: DLLoggerParams())
    create_clearml_logger: Optional[bool] = False
    clearml_logger_kwargs: Optional[ClearMLParams] = field(default_factory=lambda: ClearMLParams())
    create_neptune_logger: Optional[bool] = False
    neptune_logger_kwargs: Optional[Dict[Any, Any]] = None
    # Checkpointing parameters
    create_checkpoint_callback: Optional[bool] = True
    checkpoint_callback_params: Optional[CallbackParams] = field(default_factory=lambda: CallbackParams())
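Outside the diff, a brief sketch of how the two new dataclass fields can be set when building the structured config directly; the project and name values are placeholders:

from nemo.utils.exp_manager import ExpManagerConfig

# Sketch: the new fields default to False / None and can be overridden like any other field.
cfg = ExpManagerConfig(
    create_neptune_logger=True,
    neptune_logger_kwargs={"project": "my-workspace/my-project", "name": "demo"},  # placeholders
)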
@@ -422,6 +424,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
        or cfg.create_mlflow_logger
        or cfg.create_dllogger_logger
        or cfg.create_clearml_logger
        or cfg.create_neptune_logger
    ):
        configure_loggers(
            trainer,
@@ -440,6 +443,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
            cfg.dllogger_logger_kwargs,
            cfg.create_clearml_logger,
            cfg.clearml_logger_kwargs,
            cfg.create_neptune_logger,
            cfg.neptune_logger_kwargs,
        )

    # add loggers timing callbacks
@@ -815,6 +820,8 @@ def configure_loggers(
    dllogger_kwargs: dict,
    create_clearml_logger: bool,
    clearml_kwargs: dict,
    create_neptune_logger: bool,
    neptune_kwargs: dict,
):
    """
    Creates TensorboardLogger and/or WandBLogger / MLFlowLogger / DLlogger / ClearMLLogger and attach them to trainer.
@@ -872,6 +879,20 @@
        logger_list.append(clearml_logger)
        logging.info("ClearMLLogger has been set up")

    if create_neptune_logger:
        if neptune_kwargs is None:
            neptune_kwargs = {}
        if "name" not in neptune_kwargs and "project" not in neptune_kwargs:
            raise ValueError("name and project are required for neptune_logger")
        if "api_key" not in neptune_kwargs and not os.getenv("NEPTUNE_API_TOKEN", None):
            raise ValueError(
                "either api_key should be set in neptune_kwargs or NEPTUNE_API_TOKEN should be set in environment variable for neptune_logger"
            )
        neptune_logger = NeptuneLogger(**neptune_kwargs)

        logger_list.append(neptune_logger)
        logging.info("NeptuneLogger has been set up")

    trainer._logger_connector.configure_logger(logger_list)
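The checks above accept the credentials either inline (api_key in neptune_logger_kwargs) or via the NEPTUNE_API_TOKEN environment variable, plus a name or project; a minimal sketch of the PyTorch Lightning NeptuneLogger call this ends up making, with placeholder values:

import os
from pytorch_lightning.loggers import NeptuneLogger

# Sketch: what configure_loggers() ends up instantiating for settings like the example configs.
# Assumes NEPTUNE_API_TOKEN is already exported; project/name values are placeholders.
assert os.getenv("NEPTUNE_API_TOKEN"), "export NEPTUNE_API_TOKEN or pass api_key="
neptune_logger = NeptuneLogger(
    project="my-workspace/my-project",
    name="demo-run",
    prefix="train",
    log_model_checkpoints=False,
)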


30 changes: 30 additions & 0 deletions tests/core/test_exp_manager.py
@@ -195,6 +195,36 @@ def test_trainer_loggers(self, tmp_path):
        )
        assert isinstance(test_trainer.logger, pl.loggers.WandbLogger)

    @pytest.mark.unit
    def test_trainer_neptune_logger(self, tmp_path):
        pytest.importorskip("neptune", reason="could not import `neptune`, use `pip install neptune` to run this test")

        test_trainer = pl.Trainer(accelerator='cpu', logger=False)
        # Check that a create_neptune_logger=True errors out unless neptune_logger_kwargs is passed.
        with pytest.raises(ValueError):
            _ = exp_manager(
                test_trainer,
                {
                    "create_tensorboard_logger": False,
                    "create_checkpoint_callback": False,
                    "exp_dir": str(tmp_path),
                    "create_neptune_logger": True,
                },
            )
        # Check that a NeptuneLogger is attached to logger if create_neptune_logger=True and neptune_logger_kwargs has name
        # and project
        _ = exp_manager(
            test_trainer,
            {
                "create_tensorboard_logger": False,
                "create_checkpoint_callback": False,
                "exp_dir": str(tmp_path),
                "create_neptune_logger": True,
                "neptune_logger_kwargs": {"name": "", "project": "", "api_key": ""},
            },
        )
        assert isinstance(test_trainer.logger, pl.loggers.NeptuneLogger)

    @pytest.mark.unit
    def test_checkpoint_configurations(self):
        """ Test that trainer creating modelcheckpoint and asking exp_manager to do it too results in errors, but
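A sketch of invoking just the new test from Python (the file path is taken from this diff and assumes the repository root as the working directory; the optional neptune package must be installed, otherwise the test is skipped by importorskip):

import pytest

# Run only the new Neptune logger unit test from tests/core/test_exp_manager.py.
pytest.main(["tests/core/test_exp_manager.py", "-k", "test_trainer_neptune_logger"])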
