Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename log_save_interval, row_log_interval #3748

Merged
merged 18 commits into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Changed Trainer arguments `row_log_interval` and `log_save_interval` to
  `log_every_n_steps` and `flush_logs_every_n_steps`, respectively ([#3748](https://github.com/PyTorchLightning/pytorch-lightning/pull/3748))
Borda marked this conversation as resolved.
Show resolved Hide resolved

- Changed `LearningRateLogger` to `LearningRateMonitor` ([#3251](https://github.com/PyTorchLightning/pytorch-lightning/pull/3251))

- Used `fsspec` instead of `gfile` for all IO ([#3320](https://github.com/PyTorchLightning/pytorch-lightning/pull/3320))
Expand Down
12 changes: 6 additions & 6 deletions pytorch_lightning/trainer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,15 +574,15 @@ def on_train_end(self, trainer, pl_module):

.. note:: Might slow performance because it uses the output of nvidia-smi.

log_save_interval
^^^^^^^^^^^^^^^^^
flush_logs_every_n_steps
^^^^^^^^^^^^^^^^^^^^^^^^

Writes logs to disk this often.

.. testcode::

# default used by the Trainer
trainer = Trainer(log_save_interval=100)
trainer = Trainer(flush_logs_every_n_steps=100)

See Also:
- :ref:`Experiment Reporting <experiment_reporting>`
Expand Down Expand Up @@ -909,15 +909,15 @@ def on_train_end(self, trainer, pl_module):
# resume from a specific checkpoint
trainer = Trainer(resume_from_checkpoint='some/path/to/my_checkpoint.ckpt')

row_log_interval
^^^^^^^^^^^^^^^^
log_every_n_steps
^^^^^^^^^^^^^^^^^

How often to add logging rows (does not write to disk)

.. testcode::

# default used by the Trainer
trainer = Trainer(row_log_interval=50)
trainer = Trainer(log_every_n_steps=50)

See Also:
- :ref:`Experiment Reporting <experiment_reporting>`
Expand Down
8 changes: 4 additions & 4 deletions pytorch_lightning/trainer/connectors/logger_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ def __init__(self, trainer):
self.progress_bar_metrics = {}
self.eval_loop_results = []

def on_trainer_init(self, logger, flush_logs_every_n_steps, log_every_n_steps):
    """Configure the logger and store the logging-frequency flags on the trainer.

    Args:
        logger: logger instance, ``True`` for the default logger, or a falsy
            value to disable logging (handled by ``configure_logger``).
        flush_logs_every_n_steps: how often (in steps) logs are written to disk.
        log_every_n_steps: how often (in steps) metrics rows are logged
            (does not write to disk).
    """
    # logging
    self.configure_logger(logger)
    # expose the renamed flags on the trainer so loops can read them
    # (formerly ``log_save_interval`` / ``row_log_interval``)
    self.trainer.flush_logs_every_n_steps = flush_logs_every_n_steps
    self.trainer.log_every_n_steps = log_every_n_steps

def configure_logger(self, logger):
if logger is True:
Expand Down Expand Up @@ -470,7 +470,7 @@ def __gather_result_across_time_and_optimizers(self, epoch_output):
def log_train_step_metrics(self, batch_output):
# when metrics should be logged
should_log_metrics = (
(self.trainer.global_step + 1) % self.trainer.row_log_interval == 0 or self.trainer.should_stop
(self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0 or self.trainer.should_stop
)
if should_log_metrics or self.trainer.fast_dev_run:
# logs user requested information to logger
Expand Down
49 changes: 28 additions & 21 deletions pytorch_lightning/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,10 @@ def __init__(
limit_val_batches: Union[int, float] = 1.0,
limit_test_batches: Union[int, float] = 1.0,
val_check_interval: Union[int, float] = 1.0,
log_save_interval: int = 100,
row_log_interval: int = 50,
log_save_interval: Optional[int] = None, # backward compatible, todo: remove
teddykoker marked this conversation as resolved.
Show resolved Hide resolved
row_log_interval: Optional[int] = None, # backward compatible, todo: remove
flush_logs_every_n_steps: int = 100,
log_every_n_steps: int = 50,
distributed_backend: Optional[str] = None,
sync_batchnorm: bool = False,
precision: int = 32,
Expand Down Expand Up @@ -177,6 +179,8 @@ def __init__(

fast_dev_run: runs 1 batch of train, test and val to find any bugs (ie: a sort of unit test).

flush_logs_every_n_steps: How often to flush logs to disk (defaults to every 100 steps).

gpus: number of gpus to train on (int) or which GPUs to train on (list or str) applied per node

gradient_clip_val: 0 means don't clip.
Expand All @@ -191,7 +195,7 @@ def __init__(

log_gpu_memory: None, 'min_max', 'all'. Might slow performance

log_save_interval: Writes logs to disk this often
log_every_n_steps: How often to log within steps (defaults to every 50 steps).
Borda marked this conversation as resolved.
Show resolved Hide resolved

prepare_data_per_node: If True, each LOCAL_RANK=0 will call prepare data.
Otherwise only NODE_RANK=0, LOCAL_RANK=0 will prepare data
Expand Down Expand Up @@ -230,8 +234,6 @@ def __init__(
resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
This can be a URL.

row_log_interval: How often to add logging rows (does not write to disk)

sync_batchnorm: Synchronize batch norm layers between process groups/whole world.

terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the
Expand All @@ -257,6 +259,15 @@ def __init__(
"""
super().__init__()

# deprecation warnings
if row_log_interval is not None:
teddykoker marked this conversation as resolved.
Show resolved Hide resolved
warnings.warn("row_log_interval is deprecated, use log_every_n_steps instead", DeprecationWarning)
teddykoker marked this conversation as resolved.
Show resolved Hide resolved
log_every_n_steps = row_log_interval

if log_save_interval is not None:
warnings.warn("log_save_interval is deprecated, use flush_logs_every_n_steps instead", DeprecationWarning)
teddykoker marked this conversation as resolved.
Show resolved Hide resolved
flush_logs_every_n_steps = log_save_interval

# init connectors
self.dev_debugger = InternalDebugger(self)
self.config_validator = ConfigValidator(self)
Expand Down Expand Up @@ -291,7 +302,7 @@ def __init__(
process_position,
default_root_dir,
weights_save_path,
resume_from_checkpoint
resume_from_checkpoint,
)

# hook
Expand All @@ -302,18 +313,12 @@ def __init__(

# init data flags
self.data_connector.on_trainer_init(
check_val_every_n_epoch,
reload_dataloaders_every_epoch,
prepare_data_per_node
check_val_every_n_epoch, reload_dataloaders_every_epoch, prepare_data_per_node
)

# init training tricks
self.training_tricks_connector.on_trainer_init(
gradient_clip_val,
track_grad_norm,
accumulate_grad_batches,
truncated_bptt_steps,
terminate_on_nan
gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan
)

# init accelerator related flags
Expand All @@ -328,7 +333,7 @@ def __init__(
sync_batchnorm,
benchmark,
replace_sampler_ddp,
deterministic
deterministic,
)

# init train loop related flags
Expand All @@ -342,7 +347,7 @@ def __init__(
self.profile_connector.on_trainer_init(profiler)

# init logger flags
self.logger_connector.on_trainer_init(logger, log_save_interval, row_log_interval)
self.logger_connector.on_trainer_init(logger, flush_logs_every_n_steps, log_every_n_steps)

# init debugging flags
self.debugging_connector.on_init_start(
Expand All @@ -352,7 +357,7 @@ def __init__(
limit_test_batches,
val_check_interval,
overfit_batches,
fast_dev_run
fast_dev_run,
)

# set precision
Expand Down Expand Up @@ -502,13 +507,15 @@ def train(self):
met_min_steps = self.global_step >= self.min_steps if self.min_steps else True

if self.should_stop:
if (met_min_epochs and met_min_steps):
if met_min_epochs and met_min_steps:
self.train_loop.on_train_end()
return
else:
log.info('Trainer was signaled to stop but required minimum epochs'
f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
' not been met. Training will continue...')
log.info(
'Trainer was signaled to stop but required minimum epochs'
f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
' not been met. Training will continue...'
)

# hook
self.train_loop.on_train_end()
Expand Down
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def on_before_backward(self, batch_idx, optimizer):

def _track_gradient_norm(self):
grad_norm_dict = {}
if (self.trainer.global_step + 1) % self.trainer.row_log_interval == 0:
if (self.trainer.global_step + 1) % self.trainer.log_every_n_steps == 0:
Borda marked this conversation as resolved.
Show resolved Hide resolved
if float(self.trainer.track_grad_norm) > 0:
model = self.trainer.get_model()
grad_norm_dict = model.grad_norm(self.trainer.track_grad_norm)
Expand Down Expand Up @@ -787,7 +787,7 @@ def build_train_args(self, batch, batch_idx, opt_idx, hiddens):
def save_loggers_on_train_batch_end(self):
# when loggers should save to disk
should_save_log = (
(self.trainer.global_step + 1) % self.trainer.log_save_interval == 0 or self.trainer.should_stop
(self.trainer.global_step + 1) % self.trainer.flush_logs_every_n_steps == 0 or self.trainer.should_stop
)
if should_save_log or self.trainer.fast_dev_run:
if self.trainer.is_global_zero and self.trainer.logger is not None:
Expand Down
10 changes: 5 additions & 5 deletions tests/models/test_grad_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_grad_tracking(tmpdir, norm_type, rtol=5e-3):
default_root_dir=tmpdir,
max_epochs=3,
track_grad_norm=norm_type,
row_log_interval=1, # request grad_norms every batch
log_every_n_steps=1, # request grad_norms every batch
)
result = trainer.fit(model)

Expand All @@ -76,20 +76,20 @@ def test_grad_tracking(tmpdir, norm_type, rtol=5e-3):
assert np.allclose(log, mod, rtol=rtol)


@pytest.mark.parametrize("row_log_interval", [1, 2, 3])
def test_grad_tracking_interval(tmpdir, row_log_interval):
@pytest.mark.parametrize("log_every_n_steps", [1, 2, 3])
def test_grad_tracking_interval(tmpdir, log_every_n_steps):
""" Test that gradient norms get tracked in the right interval and that everytime the same keys get logged. """
trainer = Trainer(
default_root_dir=tmpdir,
track_grad_norm=2,
row_log_interval=row_log_interval,
log_every_n_steps=log_every_n_steps,
max_steps=10,
)

with patch.object(trainer.logger, "log_metrics") as mocked:
model = EvalModelTemplate()
trainer.fit(model)
expected = trainer.global_step // row_log_interval
expected = trainer.global_step // log_every_n_steps
grad_norm_dicts = []
for _, kwargs in mocked.call_args_list:
metrics = kwargs.get("metrics", {})
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def test_result_obj_on_tpu(tmpdir):
default_root_dir=tmpdir,
max_epochs=epochs,
early_stop_callback=True,
row_log_interval=2,
log_every_n_steps=2,
limit_train_batches=batches,
weights_summary=None,
tpu_cores=8
Expand Down
2 changes: 1 addition & 1 deletion tests/trainer/test_correct_freq_accumulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def test_training_step_scalar(tmpdir):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down
8 changes: 4 additions & 4 deletions tests/trainer/test_eval_loop_flow_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down Expand Up @@ -90,7 +90,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down Expand Up @@ -147,7 +147,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)

Expand Down Expand Up @@ -211,7 +211,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)

Expand Down
4 changes: 2 additions & 2 deletions tests/trainer/test_eval_loop_logging_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down Expand Up @@ -108,7 +108,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down
8 changes: 4 additions & 4 deletions tests/trainer/test_train_loop_flow_dict_1_0.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down Expand Up @@ -73,7 +73,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down Expand Up @@ -121,7 +121,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down Expand Up @@ -175,7 +175,7 @@ def backward(self, trainer, loss, optimizer, optimizer_idx):
limit_train_batches=2,
limit_val_batches=2,
max_epochs=2,
row_log_interval=1,
log_every_n_steps=1,
weights_summary=None,
)
trainer.fit(model)
Expand Down
Loading