
Commit e3dd054
Merge branch 'master' into enhance/on_load_dm
tchaton authored Oct 29, 2021
2 parents e5d2ac1 + 81d15c5 commit e3dd054
Showing 21 changed files with 181 additions and 239 deletions.
15 changes: 12 additions & 3 deletions CHANGELOG.md
@@ -187,9 +187,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added support for `torch.autograd.set_detect_anomaly` through `Trainer` constructor argument `detect_anomaly` ([#9848](https://github.com/PyTorchLightning/pytorch-lightning/pull/9848))


- Added a `len` method to `LightningDataModule` ([#9895](https://github.com/PyTorchLightning/pytorch-lightning/pull/9895))


- Added `enable_model_summary` flag to Trainer ([#9699](https://github.com/PyTorchLightning/pytorch-lightning/pull/9699))


@@ -356,6 +353,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed default value of the `max_steps` Trainer argument from `None` to -1 ([#9460](https://github.com/PyTorchLightning/pytorch-lightning/pull/9460))


- Raise an error when calling `log(on_step=False, on_epoch=False)` ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227))


- Disable quantization aware training observers by default during validating/testing/predicting stages ([#8540](https://github.com/PyTorchLightning/pytorch-lightning/pull/8540))


@@ -581,6 +581,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Fixed imagenet example evaluation ([#10179](https://github.com/PyTorchLightning/pytorch-lightning/pull/10179))

- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685))

@@ -668,6 +669,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed gradients not being unscaled when clipping or logging the gradient norm ([#9287](https://github.com/PyTorchLightning/pytorch-lightning/pull/9287))


- Fixed `on_before_optimizer_step` getting called before the optimizer closure (including backward) has run ([#10167](https://github.com/PyTorchLightning/pytorch-lightning/pull/10167))


- Fixed monitor value in `ModelCheckpoint` getting moved to the wrong device in a special case where it becomes NaN ([#10118](https://github.com/PyTorchLightning/pytorch-lightning/pull/10118))


@@ -677,12 +681,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed incorrect handling of sigterm ([#10189](https://github.com/PyTorchLightning/pytorch-lightning/pull/10189))


- Fixed bug where `log(on_step=True, on_epoch=True, sync_dist=True)` wouldn't reduce the value on step ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227))


- Fixed an issue with `pl.utilities.seed.reset_seed` converting the `PL_SEED_WORKERS` environment variable to `bool` ([#10099](https://github.com/PyTorchLightning/pytorch-lightning/pull/10099))


- Fixed iterating over a logger collection when `fast_dev_run > 0` ([#10232](https://github.com/PyTorchLightning/pytorch-lightning/pull/10232))


- Fixed `batch_size` in `ResultCollection` not being reset to 1 on epoch end ([#10242](https://github.com/PyTorchLightning/pytorch-lightning/pull/10242))


## [1.4.9] - 2021-09-30

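For context, a minimal sketch (not part of this commit) of how a few of the Trainer options mentioned in the changelog entries above can be combined; the values are purely illustrative:

from pytorch_lightning import Trainer

# Illustrative combination of options referenced in the changelog above.
trainer = Trainer(
    detect_anomaly=True,        # forwards to torch.autograd.set_detect_anomaly
    enable_model_summary=True,  # controls whether the model summary is printed
    max_steps=-1,               # the new default, meaning no step limit
)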
28 changes: 9 additions & 19 deletions pl_examples/domain_templates/imagenet.py
@@ -99,14 +99,17 @@ def training_step(self, batch, batch_idx):
self.log("train_acc5", acc5, on_step=True, on_epoch=True, logger=True)
return loss_train

def validation_step(self, batch, batch_idx):
def eval_step(self, batch, batch_idx, prefix: str):
images, target = batch
output = self(images)
loss_val = F.cross_entropy(output, target)
acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))
self.log("val_loss", loss_val, on_step=True, on_epoch=True)
self.log("val_acc1", acc1, on_step=True, prog_bar=True, on_epoch=True)
self.log("val_acc5", acc5, on_step=True, on_epoch=True)
self.log(f"{prefix}_loss", loss_val, on_step=True, on_epoch=True)
self.log(f"{prefix}_acc1", acc1, on_step=True, prog_bar=True, on_epoch=True)
self.log(f"{prefix}_acc5", acc5, on_step=True, on_epoch=True)

def validation_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, "val")

@staticmethod
def __accuracy(output, target, topk=(1,)):
@@ -165,21 +168,8 @@ def val_dataloader(self):
def test_dataloader(self):
return self.val_dataloader()

def test_step(self, *args, **kwargs):
return self.validation_step(*args, **kwargs)

def test_epoch_end(self, *args, **kwargs):
outputs = self.validation_epoch_end(*args, **kwargs)

def substitute_val_keys(out):
return {k.replace("val", "test"): v for k, v in out.items()}

outputs = {
"test_loss": outputs["val_loss"],
"progress_bar": substitute_val_keys(outputs["progress_bar"]),
"log": substitute_val_keys(outputs["log"]),
}
return outputs
def test_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, "test")

@staticmethod
def add_model_specific_args(parent_parser): # pragma: no-cover
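The imagenet example above now routes validation and test through a single eval_step. A minimal standalone sketch of the same pattern, assuming a placeholder DemoModule (not code from this repository):

import torch
import torch.nn.functional as F
from pytorch_lightning import LightningModule


class DemoModule(LightningModule):
    """Placeholder module illustrating the shared eval_step pattern used above."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 10)

    def forward(self, x):
        return self.layer(x)

    def eval_step(self, batch, batch_idx, prefix: str):
        images, target = batch
        loss = F.cross_entropy(self(images), target)
        # The same computation is logged under "val_*" or "test_*" keys.
        self.log(f"{prefix}_loss", loss, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        return self.eval_step(batch, batch_idx, "val")

    def test_step(self, batch, batch_idx):
        return self.eval_step(batch, batch_idx, "test")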
11 changes: 8 additions & 3 deletions pytorch_lightning/accelerators/accelerator.py
@@ -318,7 +318,7 @@ def optimizer_step(
self,
optimizer: Optimizer,
opt_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
model: Optional[Union["pl.LightningModule", Module]] = None,
**kwargs: Any
) -> None:
@@ -327,12 +327,17 @@ def optimizer_step(
Args:
optimizer: the optimizer performing the step
opt_idx: index of the current optimizer
lambda_closure: closure calculating the loss value
closure: closure calculating the loss value
model: reference to the model, optionally defining optimizer step related hooks
**kwargs: Any extra arguments to ``optimizer.step``
"""
model = model or self.lightning_module
self.precision_plugin.optimizer_step(model, optimizer, opt_idx, lambda_closure, **kwargs)
self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)

if not isinstance(model, pl.LightningModule):
# gradient clipping and norm tracking only available with a LightningModule/Trainer
return

trainer = model.trainer
assert isinstance(trainer, pl.Trainer)
# TODO: this is done for the entire model but should be changed to per-optimizer
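For readers unfamiliar with the closure argument renamed above, here is a standalone plain-PyTorch sketch of the closure contract that optimizer_step relies on; the model, batch, and target below are placeholders, not Lightning internals:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch, target = torch.randn(8, 4), torch.randn(8, 1)


def closure():
    # The closure clears gradients, computes the loss, and runs backward;
    # the optimizer calls it itself before applying the parameter update.
    optimizer.zero_grad()
    loss = F.mse_loss(model(batch), target)
    loss.backward()
    return loss


optimizer.step(closure)  # mirrors how the precision plugin receives and runs `closure`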
40 changes: 0 additions & 40 deletions pytorch_lightning/core/datamodule.py
@@ -22,10 +22,7 @@
from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks
from pytorch_lightning.core.mixins import HyperparametersMixin
from pytorch_lightning.utilities import rank_zero_deprecation
from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.argparse import add_argparse_args, from_argparse_args, get_init_arguments_and_types
from pytorch_lightning.utilities.data import has_len
from pytorch_lightning.utilities.warnings import rank_zero_warn


class LightningDataModule(CheckpointHooks, DataHooks, HyperparametersMixin):
@@ -484,40 +481,3 @@ def __getstate__(self) -> dict:
for fn in ("prepare_data", "setup", "teardown"):
del d[fn]
return d

def __len__(self) -> int:
"""Returns the total number of batches in all dataloaders defined in the datamodule."""

from pytorch_lightning.trainer.supporters import CombinedLoader

num_batches = 0
not_implemented_count = 0

def get_num_batches(dataloader: DataLoader, name: str) -> None:
nonlocal num_batches
if not has_len(dataloader):
rank_zero_warn(
f"The number of batches for a dataloader in `{name}` is counted as 0 "
"because it does not have `__len__` defined."
)
else:
num_batches += len(dataloader)

for method_name in ("train_dataloader", "val_dataloader", "test_dataloader", "predict_dataloader"):
dataloader_method = getattr(self, method_name)
if not callable(dataloader_method):
not_implemented_count += 1
continue
try:
dataloader = dataloader_method()
except NotImplementedError:
not_implemented_count += 1
continue
if isinstance(dataloader, CombinedLoader):
dataloader = dataloader.loaders
apply_to_collection(dataloader, DataLoader, get_num_batches, method_name)

if not_implemented_count == 4:
rank_zero_warn("You datamodule does not have any valid dataloader so `__len__` will be returned as 0.")

return num_batches
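With LightningDataModule.__len__ removed above, users who still want a rough batch count can compute it from the dataloaders directly. A minimal sketch under that assumption; total_batches is a hypothetical helper, not a Lightning API:

from torch.utils.data import DataLoader


def total_batches(dm) -> int:
    """Rough stand-in for the removed `__len__`: sums the lengths of whatever
    sized dataloaders the datamodule defines; unsized loaders count as 0."""
    total = 0
    for name in ("train_dataloader", "val_dataloader", "test_dataloader", "predict_dataloader"):
        method = getattr(dm, name, None)
        if not callable(method):
            continue
        try:
            loaders = method()
        except NotImplementedError:
            continue
        if isinstance(loaders, DataLoader):
            loaders = [loaders]
        if not isinstance(loaders, (list, tuple)):
            continue  # dict-style or combined loaders are skipped in this simple sketch
        for dl in loaders:
            try:
                total += len(dl)
            except TypeError:  # iterable-style datasets define no __len__
                pass
    return total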
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/precision/apex_amp.py
@@ -101,14 +101,14 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"apex AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
skipped_backward = closure_result is None
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/precision/deepspeed_precision.py
@@ -55,14 +55,14 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"DeepSpeed and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
skipped_backward = closure_result is None
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/precision/ipu_precision.py
@@ -43,15 +43,15 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
"""IPUs handle the optimizer step internally."""
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"IPUs and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
skipped_backward = closure_result is None
6 changes: 3 additions & 3 deletions pytorch_lightning/plugins/precision/native_amp.py
@@ -72,17 +72,17 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
if self.scaler is None:
# skip scaler logic, as bfloat16 does not require scaler
return super().optimizer_step(model, optimizer, optimizer_idx, lambda_closure, **kwargs)
return super().optimizer_step(model, optimizer, optimizer_idx, closure, **kwargs)
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"Native AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
# `unscale` after the closure is executed but before the `on_before_optimizer_step` hook.
self.scaler.unscale_(optimizer)
if isinstance(model, pl.LightningModule):
26 changes: 22 additions & 4 deletions pytorch_lightning/plugins/precision/precision_plugin.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from functools import partial
from typing import Any, Callable, Generator, List, Optional, Tuple, Union

import torch
@@ -110,21 +111,38 @@ def _run_backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **k
"""
tensor.backward(*args, **kwargs)

def _wrap_closure(
self,
model: "pl.LightningModule",
optimizer: Optimizer,
optimizer_idx: int,
closure: Callable[[], Any],
) -> Any:
"""This double-closure allows makes sure the ``closure`` is executed before the
``on_before_optimizer_step`` hook is called.
The closure (generally) runs ``backward`` so this allows inspecting gradients in this hook. This structure is
consistent with the ``PrecisionPlugin`` subclasses that cannot pass ``optimizer.step(closure)`` directly.
"""
closure_result = closure()
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
return closure_result

def optimizer_step(
self,
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
"""Hook to run the optimizer step."""
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
optimizer.step(closure=lambda_closure, **kwargs)
closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
optimizer.step(closure=closure, **kwargs)

def _track_grad_norm(self, trainer: "pl.Trainer") -> None:
if float(trainer.track_grad_norm) == -1:
if trainer.track_grad_norm == -1:
return
grad_norm_dict = grad_norm(trainer.lightning_module, trainer.track_grad_norm, trainer.logger.group_separator)
if grad_norm_dict:
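To make the _wrap_closure change above concrete, a small standalone sketch (plain PyTorch, not the Lightning implementation) of wrapping a closure so a hook runs after backward but before the optimizer applies the update; the model and hook below are placeholders:

from functools import partial

import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch, target = torch.randn(8, 4), torch.randn(8, 1)


def closure():
    optimizer.zero_grad()
    loss = F.mse_loss(model(batch), target)
    loss.backward()
    return loss


def on_before_optimizer_step():
    # Gradients already exist here because the wrapped closure ran backward first.
    print("grad norm:", model.weight.grad.norm().item())


def wrapped(inner, hook):
    result = inner()  # forward + backward
    hook()            # inspect or clip gradients before the parameter update
    return result


optimizer.step(closure=partial(wrapped, closure, on_before_optimizer_step))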
7 changes: 4 additions & 3 deletions pytorch_lightning/plugins/precision/tpu.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Any, Callable, Union

from torch.nn import Module
@@ -31,12 +32,12 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any
) -> None:
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
closure_result = xm.optimizer_step(optimizer, optimizer_args={"closure": lambda_closure, **kwargs})
closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
closure_result = xm.optimizer_step(optimizer, optimizer_args={"closure": closure, **kwargs})
skipped_backward = closure_result is None
# in manual optimization, the closure does not return a value
if isinstance(model, pl.LightningModule) and model.automatic_optimization and skipped_backward:
@@ -277,7 +277,8 @@ def _set_devices_if_none(self) -> None:
self.devices = self.num_processes

def _handle_accelerator_and_strategy(self) -> None:
if self.distributed_backend is not None and self.distributed_backend in list(DistributedType):
deprecated_types = [t for t in DistributedType if t not in (DistributedType.TPU_SPAWN, DistributedType.DDP_CPU)]
if self.distributed_backend is not None and self.distributed_backend in deprecated_types:
rank_zero_deprecation(
f"Passing `Trainer(accelerator={self.distributed_backend!r})` has been deprecated"
f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={self.distributed_backend!r})` instead."
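As context for the deprecation warning built above, a hedged sketch of the migration it asks for; the values are illustrative and assume an environment where the "ddp" strategy is actually usable (for example, two GPUs):

from pytorch_lightning import Trainer

# Deprecated in v1.5 (triggers the warning above), removal planned for v1.7:
trainer = Trainer(accelerator="ddp", gpus=2)

# Preferred spelling: pass the same value through `strategy` instead.
trainer = Trainer(strategy="ddp", gpus=2)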
@@ -273,6 +273,8 @@ def epoch_end_reached(self) -> None:
self._epoch_end_reached = True
self._batch_idx = None
self._split_idx = None
assert self.trainer._results is not None
self.trainer._results.batch_size = 1

def on_epoch_end(self) -> None:
assert self._epoch_end_reached
11 changes: 8 additions & 3 deletions pytorch_lightning/trainer/connectors/logger_connector/result.py
@@ -98,6 +98,8 @@ class _Metadata:
_sync: Optional[_Sync] = None

def __post_init__(self) -> None:
if not self.on_step and not self.on_epoch:
raise MisconfigurationException("`self.log(on_step=False, on_epoch=False)` is not useful.")
self._parse_reduce_fx()

def _parse_reduce_fx(self) -> None:
@@ -192,11 +194,14 @@ def __init__(self, metadata: _Metadata, is_tensor: bool) -> None:
def update(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None:
if self.is_tensor:
value = value.float()
if self.meta.on_step:
self._forward_cache = self.meta.sync(value.clone()) # `clone` because `sync` is in-place

# performance: no need to accumulate on values only logged on_step
if self.meta.on_step and not self.meta.on_epoch:
self._forward_cache = self.value = self.meta.sync(value)
if not self.meta.on_epoch:
self.value = self._forward_cache
return
self._forward_cache = value

# perform accumulation with reduction
if self.meta.is_mean_reduction:
self.value += value.mean() * batch_size
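Taken together, the result.py changes above alter what self.log accepts and how per-step values are synced. A minimal sketch of the user-facing behavior, assuming a placeholder LoggingDemo module:

import torch
from pytorch_lightning import LightningModule


class LoggingDemo(LightningModule):
    """Placeholder module showing the `self.log` combinations affected by this commit."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        loss = self.layer(batch).mean()

        # Logged per step and per epoch; with `sync_dist=True` the per-step value
        # is now reduced across processes too (see the `meta.sync(value.clone())` branch above).
        self.log("train_loss", loss, on_step=True, on_epoch=True, sync_dist=True)

        # This combination now raises a MisconfigurationException, because a value
        # logged neither on step nor on epoch would never be used:
        # self.log("train_loss", loss, on_step=False, on_epoch=False)

        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)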