
Commit e3dd054
Merge branch 'master' into enhance/on_load_dm
tchaton authored Oct 29, 2021
2 parents e5d2ac1 + 81d15c5 commit e3dd054
Showing 21 changed files with 181 additions and 239 deletions.
15 changes: 12 additions & 3 deletions CHANGELOG.md
@@ -187,9 +187,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added support for `torch.autograd.set_detect_anomaly` through `Trainer` constructor argument `detect_anomaly` ([#9848](https://github.com/PyTorchLightning/pytorch-lightning/pull/9848))


- Added a `len` method to `LightningDataModule` ([#9895](https://github.com/PyTorchLightning/pytorch-lightning/pull/9895))


- Added `enable_model_summary` flag to Trainer ([#9699](https://github.com/PyTorchLightning/pytorch-lightning/pull/9699))


@@ -356,6 +353,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed default value of the `max_steps` Trainer argument from `None` to -1 ([#9460](https://github.com/PyTorchLightning/pytorch-lightning/pull/9460))


- Raise an error when calling `log(on_step=False, on_epoch=False)` ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227))


- Disable quantization aware training observers by default during validating/testing/predicting stages ([#8540](https://github.com/PyTorchLightning/pytorch-lightning/pull/8540))


@@ -581,6 +581,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Fixed

- Fixed imagenet example evaluation ([#10179](https://github.com/PyTorchLightning/pytorch-lightning/pull/10179))

- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685))

@@ -668,6 +669,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed gradients not being unscaled when clipping or logging the gradient norm ([#9287](https://github.com/PyTorchLightning/pytorch-lightning/pull/9287))


- Fixed `on_before_optimizer_step` getting called before the optimizer closure (including backward) has run ([#10167](https://github.com/PyTorchLightning/pytorch-lightning/pull/10167))


- Fixed monitor value in `ModelCheckpoint` getting moved to the wrong device in a special case where it becomes NaN ([#10118](https://github.com/PyTorchLightning/pytorch-lightning/pull/10118))


@@ -677,12 +681,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed incorrect handling of sigterm ([#10189](https://github.com/PyTorchLightning/pytorch-lightning/pull/10189))


- Fixed bug where `log(on_step=True, on_epoch=True, sync_dist=True)` wouldn't reduce the value on step ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227))


- Fixed an issue with `pl.utilities.seed.reset_seed` converting the `PL_SEED_WORKERS` environment variable to `bool` ([#10099](https://github.com/PyTorchLightning/pytorch-lightning/pull/10099))


- Fixed iterating over a logger collection when `fast_dev_run > 0` ([#10232](https://github.com/PyTorchLightning/pytorch-lightning/pull/10232))


- Fixed `batch_size` in `ResultCollection` not being reset to 1 on epoch end ([#10242](https://github.com/PyTorchLightning/pytorch-lightning/pull/10242))


## [1.4.9] - 2021-09-30

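For context, a minimal sketch (not part of this commit) of how a few of the Trainer options mentioned in the changelog entries above can be combined; the values are purely illustrative:

from pytorch_lightning import Trainer

# Illustrative combination of options referenced in the changelog above.
trainer = Trainer(
    detect_anomaly=True,        # forwards to torch.autograd.set_detect_anomaly
    enable_model_summary=True,  # controls whether the model summary is printed
    max_steps=-1,               # the new default, meaning no step limit
)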
28 changes: 9 additions & 19 deletions pl_examples/domain_templates/imagenet.py
@@ -99,14 +99,17 @@ def training_step(self, batch, batch_idx):
self.log("train_acc5", acc5, on_step=True, on_epoch=True, logger=True)
return loss_train

def validation_step(self, batch, batch_idx):
def eval_step(self, batch, batch_idx, prefix: str):
images, target = batch
output = self(images)
loss_val = F.cross_entropy(output, target)
acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))
self.log("val_loss", loss_val, on_step=True, on_epoch=True)
self.log("val_acc1", acc1, on_step=True, prog_bar=True, on_epoch=True)
self.log("val_acc5", acc5, on_step=True, on_epoch=True)
self.log(f"{prefix}_loss", loss_val, on_step=True, on_epoch=True)
self.log(f"{prefix}_acc1", acc1, on_step=True, prog_bar=True, on_epoch=True)
self.log(f"{prefix}_acc5", acc5, on_step=True, on_epoch=True)

def validation_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, "val")

@staticmethod
def __accuracy(output, target, topk=(1,)):
@@ -165,21 +168,8 @@ def val_dataloader(self):
def test_dataloader(self):
return self.val_dataloader()

def test_step(self, *args, **kwargs):
return self.validation_step(*args, **kwargs)

def test_epoch_end(self, *args, **kwargs):
outputs = self.validation_epoch_end(*args, **kwargs)

def substitute_val_keys(out):
return {k.replace("val", "test"): v for k, v in out.items()}

outputs = {
"test_loss": outputs["val_loss"],
"progress_bar": substitute_val_keys(outputs["progress_bar"]),
"log": substitute_val_keys(outputs["log"]),
}
return outputs
def test_step(self, batch, batch_idx):
return self.eval_step(batch, batch_idx, "test")

@staticmethod
def add_model_specific_args(parent_parser): # pragma: no-cover
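The imagenet example above now routes validation and test through a single eval_step. A minimal standalone sketch of the same pattern, assuming a placeholder DemoModule (not code from this repository):

import torch
import torch.nn.functional as F
from pytorch_lightning import LightningModule


class DemoModule(LightningModule):
    """Placeholder module illustrating the shared eval_step pattern used above."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 10)

    def forward(self, x):
        return self.layer(x)

    def eval_step(self, batch, batch_idx, prefix: str):
        images, target = batch
        loss = F.cross_entropy(self(images), target)
        # The same computation is logged under "val_*" or "test_*" keys.
        self.log(f"{prefix}_loss", loss, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        return self.eval_step(batch, batch_idx, "val")

    def test_step(self, batch, batch_idx):
        return self.eval_step(batch, batch_idx, "test")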
11 changes: 8 additions & 3 deletions pytorch_lightning/accelerators/accelerator.py
@@ -318,7 +318,7 @@ def optimizer_step(
self,
optimizer: Optimizer,
opt_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
model: Optional[Union["pl.LightningModule", Module]] = None,
**kwargs: Any
) -> None:
@@ -327,12 +327,17 @@ def optimizer_step(
Args:
optimizer: the optimizer performing the step
opt_idx: index of the current optimizer
lambda_closure: closure calculating the loss value
closure: closure calculating the loss value
model: reference to the model, optionally defining optimizer step related hooks
**kwargs: Any extra arguments to ``optimizer.step``
"""
model = model or self.lightning_module
self.precision_plugin.optimizer_step(model, optimizer, opt_idx, lambda_closure, **kwargs)
self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)

if not isinstance(model, pl.LightningModule):
# gradient clipping and norm tracking only available with a LightningModule/Trainer
return

trainer = model.trainer
assert isinstance(trainer, pl.Trainer)
# TODO: this is done for the entire model but should be changed to per-optimizer
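For readers unfamiliar with the closure argument renamed above, here is a standalone plain-PyTorch sketch of the closure contract that optimizer_step relies on; the model, batch, and target below are placeholders, not Lightning internals:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch, target = torch.randn(8, 4), torch.randn(8, 1)


def closure():
    # The closure clears gradients, computes the loss, and runs backward;
    # the optimizer calls it itself before applying the parameter update.
    optimizer.zero_grad()
    loss = F.mse_loss(model(batch), target)
    loss.backward()
    return loss


optimizer.step(closure)  # mirrors how the precision plugin receives and runs `closure`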
40 changes: 0 additions & 40 deletions pytorch_lightning/core/datamodule.py
@@ -22,10 +22,7 @@
from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks
from pytorch_lightning.core.mixins import HyperparametersMixin
from pytorch_lightning.utilities import rank_zero_deprecation
from pytorch_lightning.utilities.apply_func import apply_to_collection
from pytorch_lightning.utilities.argparse import add_argparse_args, from_argparse_args, get_init_arguments_and_types
from pytorch_lightning.utilities.data import has_len
from pytorch_lightning.utilities.warnings import rank_zero_warn


class LightningDataModule(CheckpointHooks, DataHooks, HyperparametersMixin):
@@ -484,40 +481,3 @@ def __getstate__(self) -> dict:
for fn in ("prepare_data", "setup", "teardown"):
del d[fn]
return d

def __len__(self) -> int:
"""Returns the total number of batches in all dataloaders defined in the datamodule."""

from pytorch_lightning.trainer.supporters import CombinedLoader

num_batches = 0
not_implemented_count = 0

def get_num_batches(dataloader: DataLoader, name: str) -> None:
nonlocal num_batches
if not has_len(dataloader):
rank_zero_warn(
f"The number of batches for a dataloader in `{name}` is counted as 0 "
"because it does not have `__len__` defined."
)
else:
num_batches += len(dataloader)

for method_name in ("train_dataloader", "val_dataloader", "test_dataloader", "predict_dataloader"):
dataloader_method = getattr(self, method_name)
if not callable(dataloader_method):
not_implemented_count += 1
continue
try:
dataloader = dataloader_method()
except NotImplementedError:
not_implemented_count += 1
continue
if isinstance(dataloader, CombinedLoader):
dataloader = dataloader.loaders
apply_to_collection(dataloader, DataLoader, get_num_batches, method_name)

if not_implemented_count == 4:
rank_zero_warn("You datamodule does not have any valid dataloader so `__len__` will be returned as 0.")

return num_batches
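With LightningDataModule.__len__ removed above, users who still want a rough batch count can compute it from the dataloaders directly. A minimal sketch under that assumption; total_batches is a hypothetical helper, not a Lightning API:

from torch.utils.data import DataLoader


def total_batches(dm) -> int:
    """Rough stand-in for the removed `__len__`: sums the lengths of whatever
    sized dataloaders the datamodule defines; unsized loaders count as 0."""
    total = 0
    for name in ("train_dataloader", "val_dataloader", "test_dataloader", "predict_dataloader"):
        method = getattr(dm, name, None)
        if not callable(method):
            continue
        try:
            loaders = method()
        except NotImplementedError:
            continue
        if isinstance(loaders, DataLoader):
            loaders = [loaders]
        if not isinstance(loaders, (list, tuple)):
            continue  # dict-style or combined loaders are skipped in this simple sketch
        for dl in loaders:
            try:
                total += len(dl)
            except TypeError:  # iterable-style datasets define no __len__
                pass
    return total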
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/precision/apex_amp.py
@@ -101,14 +101,14 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"apex AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
skipped_backward = closure_result is None
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/precision/deepspeed_precision.py
@@ -55,14 +55,14 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"DeepSpeed and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
skipped_backward = closure_result is None
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/precision/ipu_precision.py
@@ -43,15 +43,15 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
"""IPUs handle the optimizer step internally."""
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"IPUs and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
skipped_backward = closure_result is None
6 changes: 3 additions & 3 deletions pytorch_lightning/plugins/precision/native_amp.py
@@ -72,17 +72,17 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
if self.scaler is None:
# skip scaler logic, as bfloat16 does not require scaler
return super().optimizer_step(model, optimizer, optimizer_idx, lambda_closure, **kwargs)
return super().optimizer_step(model, optimizer, optimizer_idx, closure, **kwargs)
if isinstance(optimizer, LBFGS):
raise MisconfigurationException(
f"Native AMP and the LBFGS optimizer are not compatible (optimizer {optimizer_idx})."
)
closure_result = lambda_closure()
closure_result = closure()
# `unscale` after the closure is executed but before the `on_before_optimizer_step` hook.
self.scaler.unscale_(optimizer)
if isinstance(model, pl.LightningModule):
26 changes: 22 additions & 4 deletions pytorch_lightning/plugins/precision/precision_plugin.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from functools import partial
from typing import Any, Callable, Generator, List, Optional, Tuple, Union

import torch
@@ -110,21 +111,38 @@ def _run_backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **k
"""
tensor.backward(*args, **kwargs)

def _wrap_closure(
self,
model: "pl.LightningModule",
optimizer: Optimizer,
optimizer_idx: int,
closure: Callable[[], Any],
) -> Any:
"""This double-closure allows makes sure the ``closure`` is executed before the
``on_before_optimizer_step`` hook is called.
The closure (generally) runs ``backward`` so this allows inspecting gradients in this hook. This structure is
consistent with the ``PrecisionPlugin`` subclasses that cannot pass ``optimizer.step(closure)`` directly.
"""
closure_result = closure()
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
return closure_result

def optimizer_step(
self,
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any,
) -> None:
"""Hook to run the optimizer step."""
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
optimizer.step(closure=lambda_closure, **kwargs)
closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
optimizer.step(closure=closure, **kwargs)

def _track_grad_norm(self, trainer: "pl.Trainer") -> None:
if float(trainer.track_grad_norm) == -1:
if trainer.track_grad_norm == -1:
return
grad_norm_dict = grad_norm(trainer.lightning_module, trainer.track_grad_norm, trainer.logger.group_separator)
if grad_norm_dict:
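To make the _wrap_closure change above concrete, a small standalone sketch (plain PyTorch, not the Lightning implementation) of wrapping a closure so a hook runs after backward but before the optimizer applies the update; the model and hook below are placeholders:

from functools import partial

import torch
import torch.nn.functional as F

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
batch, target = torch.randn(8, 4), torch.randn(8, 1)


def closure():
    optimizer.zero_grad()
    loss = F.mse_loss(model(batch), target)
    loss.backward()
    return loss


def on_before_optimizer_step():
    # Gradients already exist here because the wrapped closure ran backward first.
    print("grad norm:", model.weight.grad.norm().item())


def wrapped(inner, hook):
    result = inner()  # forward + backward
    hook()            # inspect or clip gradients before the parameter update
    return result


optimizer.step(closure=partial(wrapped, closure, on_before_optimizer_step))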
7 changes: 4 additions & 3 deletions pytorch_lightning/plugins/precision/tpu.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import Any, Callable, Union

from torch.nn import Module
@@ -31,12 +32,12 @@ def optimizer_step(
model: Union["pl.LightningModule", Module],
optimizer: Optimizer,
optimizer_idx: int,
lambda_closure: Callable[[], Any],
closure: Callable[[], Any],
**kwargs: Any
) -> None:
if isinstance(model, pl.LightningModule):
model.trainer.call_hook("on_before_optimizer_step", optimizer, optimizer_idx)
closure_result = xm.optimizer_step(optimizer, optimizer_args={"closure": lambda_closure, **kwargs})
closure = partial(self._wrap_closure, model, optimizer, optimizer_idx, closure)
closure_result = xm.optimizer_step(optimizer, optimizer_args={"closure": closure, **kwargs})
skipped_backward = closure_result is None
# in manual optimization, the closure does not return a value
if isinstance(model, pl.LightningModule) and model.automatic_optimization and skipped_backward:
@@ -277,7 +277,8 @@ def _set_devices_if_none(self) -> None:
self.devices = self.num_processes

def _handle_accelerator_and_strategy(self) -> None:
if self.distributed_backend is not None and self.distributed_backend in list(DistributedType):
deprecated_types = [t for t in DistributedType if t not in (DistributedType.TPU_SPAWN, DistributedType.DDP_CPU)]
if self.distributed_backend is not None and self.distributed_backend in deprecated_types:
rank_zero_deprecation(
f"Passing `Trainer(accelerator={self.distributed_backend!r})` has been deprecated"
f" in v1.5 and will be removed in v1.7. Use `Trainer(strategy={self.distributed_backend!r})` instead."
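As context for the deprecation warning built above, a hedged sketch of the migration it asks for; the values are illustrative and assume an environment where the "ddp" strategy is actually usable (for example, two GPUs):

from pytorch_lightning import Trainer

# Deprecated in v1.5 (triggers the warning above), removal planned for v1.7:
trainer = Trainer(accelerator="ddp", gpus=2)

# Preferred spelling: pass the same value through `strategy` instead.
trainer = Trainer(strategy="ddp", gpus=2)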
@@ -273,6 +273,8 @@ def epoch_end_reached(self) -> None:
self._epoch_end_reached = True
self._batch_idx = None
self._split_idx = None
assert self.trainer._results is not None
self.trainer._results.batch_size = 1

def on_epoch_end(self) -> None:
assert self._epoch_end_reached
11 changes: 8 additions & 3 deletions pytorch_lightning/trainer/connectors/logger_connector/result.py
@@ -98,6 +98,8 @@ class _Metadata:
_sync: Optional[_Sync] = None

def __post_init__(self) -> None:
if not self.on_step and not self.on_epoch:
raise MisconfigurationException("`self.log(on_step=False, on_epoch=False)` is not useful.")
self._parse_reduce_fx()

def _parse_reduce_fx(self) -> None:
@@ -192,11 +194,14 @@ def __init__(self, metadata: _Metadata, is_tensor: bool) -> None:
def update(self, value: _IN_METRIC, batch_size: torch.Tensor) -> None:
if self.is_tensor:
value = value.float()
if self.meta.on_step:
self._forward_cache = self.meta.sync(value.clone()) # `clone` because `sync` is in-place

# performance: no need to accumulate on values only logged on_step
if self.meta.on_step and not self.meta.on_epoch:
self._forward_cache = self.value = self.meta.sync(value)
if not self.meta.on_epoch:
self.value = self._forward_cache
return
self._forward_cache = value

# perform accumulation with reduction
if self.meta.is_mean_reduction:
self.value += value.mean() * batch_size
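Taken together, the result.py changes above alter what self.log accepts and how per-step values are synced. A minimal sketch of the user-facing behavior, assuming a placeholder LoggingDemo module:

import torch
from pytorch_lightning import LightningModule


class LoggingDemo(LightningModule):
    """Placeholder module showing the `self.log` combinations affected by this commit."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(4, 1)

    def training_step(self, batch, batch_idx):
        loss = self.layer(batch).mean()

        # Logged per step and per epoch; with `sync_dist=True` the per-step value
        # is now reduced across processes too (see the `meta.sync(value.clone())` branch above).
        self.log("train_loss", loss, on_step=True, on_epoch=True, sync_dist=True)

        # This combination now raises a MisconfigurationException, because a value
        # logged neither on step nor on epoch would never be used:
        # self.log("train_loss", loss, on_step=False, on_epoch=False)

        return loss

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)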