Commit
Merge branch 'master' into bugfix/6929-missing-lightningmodule-datamodule-reference
carmocca committed May 3, 2021
2 parents f85a202 + 6d7c6d6 commit fd60e39
Showing 19 changed files with 751 additions and 547 deletions.
18 changes: 9 additions & 9 deletions .github/CODEOWNERS
This CODEOWNERS file contains errors: GitHub reports "Unknown owner" on lines 8, 11-13, 16-19, 22-34, 37-39, 42-46 and 52 for the handles @carmocca, @SeanNaren, @kaushikb11, @awaelchli, @edenlightning, @ananyahjha93 and @ananthsub — each flagged owner must exist and have write access to the repository.
@@ -8,15 +8,15 @@
 * @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11

 # CI/CD and configs
-/.github/       @borda @tchaton
-/dockers/       @borda @tchaton
-*.yml           @borda @tchaton
+/.github/       @borda @tchaton @carmocca
+/dockers/       @borda @tchaton @carmocca
+*.yml           @borda @tchaton @carmocca

 # Docs
 /docs/                      @edenlightning @tchaton @borda @awaelchli
 /.github/*.md               @edenlightning @williamfalcon @borda
 /.github/ISSUE_TEMPLATE/    @edenlightning @borda @tchaton
-/docs/source/conf.py        @borda @awaelchli
+/docs/source/conf.py        @borda @awaelchli @carmocca

 # Packages
 /pytorch_lightning/accelerators    @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11
@@ -39,11 +39,11 @@
 /docs/source/metrics.rst           @SkafteNicki @ananyahjha93 @justusschock

 # API
-/pytorch_lightning/callbacks/base.py     @williamfalcon
-/pytorch_lightning/core/datamodule.py    @williamFalcon
-/pytorch_lightning/trainer/trainer.py    @williamfalcon @tchaton
-/pytorch_lightning/core/hooks.py         @williamfalcon
-/pytorch_lightning/core/lightning.py     @williamfalcon @tchaton
+/pytorch_lightning/callbacks/base.py     @williamfalcon @awaelchli @ananthsub @carmocca
+/pytorch_lightning/core/datamodule.py    @williamFalcon @awaelchli @ananthsub @carmocca
+/pytorch_lightning/trainer/trainer.py    @williamfalcon @tchaton @awaelchli
+/pytorch_lightning/core/hooks.py         @williamfalcon @tchaton @awaelchli @ananthsub @carmocca
+/pytorch_lightning/core/lightning.py     @williamfalcon @tchaton @awaelchli

 # Testing
 /tests/helpers/boring_model.py           @williamfalcon @tchaton @borda
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -145,6 +145,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added warning when missing `Callback` and using `resume_from_checkpoint` ([#7254](https://github.com/PyTorchLightning/pytorch-lightning/pull/7254))


+- Improved verbose logging for `EarlyStopping` callback ([#6811](https://github.com/PyTorchLightning/pytorch-lightning/pull/6811))
+
+
 ### Changed

@@ -299,6 +302,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ### Fixed


+- Fixed NaN errors in progress bars when training with iterable datasets with no length defined ([#7306](https://github.com/PyTorchLightning/pytorch-lightning/pull/7306))
+
+
+- Fixed validation being skipped for iterable datasets with no length defined ([#7306](https://github.com/PyTorchLightning/pytorch-lightning/pull/7306))
+
+
 - Fixed attaching train and validation dataloaders when `reload_dataloaders_every_epoch=True` and `num_sanity_val_steps=0` ([#7207](https://github.com/PyTorchLightning/pytorch-lightning/pull/7207))

4 changes: 2 additions & 2 deletions dockers/base-cuda/Dockerfile
@@ -118,8 +118,8 @@ RUN \

 RUN \
     # install DeepSpeed
-    # TODO(@SeanNaren): 0.3.15 is broken - skipping to unblock
-    pip install 'deepspeed>=0.3.14,!=0.3.15'
+    # TODO(@SeanNaren): CI failing with `>=0.3.15` - skipping to unblock
+    pip install deepspeed==0.3.14

 RUN \
     # Show what we have
58 changes: 46 additions & 12 deletions docs/source/common/lightning_cli.rst
@@ -14,18 +14,14 @@
     def __init__(
         self,
         encoder_layers: int = 12,
-        decoder_layers: List[int] = [2, 4]
+        decoder_layers: List[int] = [2, 4],
+        batch_size: int = 8,
     ):
-        """Example encoder-decoder model
-
-        Args:
-            encoder_layers: Number of layers for the encoder
-            decoder_layers: Number of layers for each decoder block
-        """
         pass

 class MyDataModule(LightningDataModule):
-    pass
+    def __init__(self, batch_size: int = 8):
+        pass

 def send_email(address, message):
     pass
@@ -119,7 +115,7 @@ The start of a possible implementation of :class:`MyModel` including the recommended
 docstring could be the one below. Note that by using type hints and docstrings there is no need to duplicate this
 information to define its configurable arguments.

-.. testcode::
+.. testcode:: mymodel

     class MyModel(LightningModule):

@@ -373,8 +369,46 @@ before and after the execution of fit. The code would be something like:

     cli = MyLightningCLI(MyModel)

 Note that the config object :code:`self.config` is a dictionary whose keys are global options or groups of options. It
-has the same structure as the yaml format as described previously. This means for instance that the parameters used for
+has the same structure as the yaml format described previously. This means for instance that the parameters used for
 instantiating the trainer class can be found in :code:`self.config['trainer']`.

-For more advanced use cases, other methods of the :class:`~pytorch_lightning.utilities.cli.LightningCLI` class could be
-extended. For further information have a look at the corresponding API reference.
+Another case in which it might be desired to extend :class:`~pytorch_lightning.utilities.cli.LightningCLI` is that the
+model and data module depend on a common parameter. For example in some cases both classes require to know the
+:code:`batch_size`. It is a burden and error prone giving the same value twice in a config file. To avoid this the
+parser can be configured so that a value is only given once and then propagated accordingly. With a tool implemented
+like shown below, the :code:`batch_size` only has to be provided in the :code:`data` section of the config.
+
+.. testcode::
+
+    from pytorch_lightning.utilities.cli import LightningCLI
+
+    class MyLightningCLI(LightningCLI):
+
+        def add_arguments_to_parser(self, parser):
+            parser.link_arguments('data.batch_size', 'model.batch_size')
+
+    cli = MyLightningCLI(MyModel, MyDataModule)
+
+The linking of arguments is observed in the help of the tool, which for this example would look like:
+
+.. code-block:: bash
+
+    $ python trainer.py --help
+    ...
+      --data.batch_size BATCH_SIZE
+                            Number of samples in a batch (type: int, default: 8)
+
+    Linked arguments:
+      model.batch_size <-- data.batch_size
+                            Number of samples in a batch (type: int)
+
+.. tip::
+
+    The linking of arguments can be used for more complex cases. For example to derive a value via a function that takes
+    multiple settings as input. For more details have a look at the API of `link_arguments
+    <https://jsonargparse.readthedocs.io/en/stable/#jsonargparse.core.ArgumentParser.link_arguments>`_.
+
+.. tip::
+
+    Have a look at the :class:`~pytorch_lightning.utilities.cli.LightningCLI` class API reference to learn about other
+    methods that can be extended to customize a CLI.
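
[Editor's note] The `link_arguments` API referenced in the first tip also accepts several sources combined through a `compute_fn`. A minimal sketch under the jsonargparse semantics linked above — `model.effective_batch_size` is a hypothetical field used only for illustration, not part of this commit:

    # Sketch only: tuple-of-sources + compute_fn per the jsonargparse
    # `link_arguments` API; `model.effective_batch_size` is hypothetical.
    from pytorch_lightning.utilities.cli import LightningCLI

    class MyLightningCLI(LightningCLI):

        def add_arguments_to_parser(self, parser):
            # one-to-one propagation, exactly as in the docs diff above
            parser.link_arguments('data.batch_size', 'model.batch_size')
            # derived value: combine two settings into one target
            parser.link_arguments(
                ('data.batch_size', 'trainer.accumulate_grad_batches'),
                'model.effective_batch_size',
                compute_fn=lambda bs, accum: bs * (accum or 1),
            )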
136 changes: 39 additions & 97 deletions pl_examples/domain_templates/computer_vision_fine_tuning.py
@@ -37,9 +37,8 @@
 Note:
     See: https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
 """
-import argparse

 import logging
 import os
 from pathlib import Path
 from typing import Union
@@ -59,6 +58,7 @@
 from pytorch_lightning import LightningDataModule
 from pytorch_lightning.callbacks.finetuning import BaseFinetuning
 from pytorch_lightning.utilities import rank_zero_info
+from pytorch_lightning.utilities.cli import LightningCLI

 log = logging.getLogger(__name__)
 DATA_URL = "https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip"
@@ -93,10 +93,17 @@ class CatDogImageDataModule(LightningDataModule):

     def __init__(
         self,
-        dl_path: Union[str, Path],
+        dl_path: Union[str, Path] = "data",
         num_workers: int = 0,
         batch_size: int = 8,
     ):
+        """CatDogImageDataModule
+
+        Args:
+            dl_path: root directory where to download the data
+            num_workers: number of CPU workers
+            batch_size: number of sample in a batch
+        """
         super().__init__()

         self._dl_path = dl_path
@@ -146,17 +153,6 @@ def val_dataloader(self):
         log.info("Validation data loaded.")
         return self.__dataloader(train=False)

-    @staticmethod
-    def add_model_specific_args(parent_parser):
-        parser = parent_parser.add_argument_group("CatDogImageDataModule")
-        parser.add_argument(
-            "--num-workers", default=0, type=int, metavar="W", help="number of CPU workers", dest="num_workers"
-        )
-        parser.add_argument(
-            "--batch-size", default=8, type=int, metavar="W", help="number of sample in a batch", dest="batch_size"
-        )
-        return parent_parser
-

 # --- Pytorch-lightning module ---

@@ -166,17 +162,22 @@ class TransferLearningModel(pl.LightningModule):
     def __init__(
         self,
         backbone: str = "resnet50",
-        train_bn: bool = True,
-        milestones: tuple = (5, 10),
+        train_bn: bool = False,
+        milestones: tuple = (2, 4),
         batch_size: int = 32,
-        lr: float = 1e-2,
+        lr: float = 1e-3,
         lr_scheduler_gamma: float = 1e-1,
         num_workers: int = 6,
         **kwargs,
     ) -> None:
-        """
+        """TransferLearningModel
+
         Args:
-            dl_path: Path where the data will be downloaded
+            backbone: Name (as in ``torchvision.models``) of the feature extractor
+            train_bn: Whether the BatchNorm layers should be trainable
+            milestones: List of two epochs milestones
+            lr: Initial learning rate
+            lr_scheduler_gamma: Factor by which the learning rate is reduced at each milestone
         """
         super().__init__()
         self.backbone = backbone
@@ -269,90 +270,31 @@ def configure_optimizers(self):
         scheduler = MultiStepLR(optimizer, milestones=self.milestones, gamma=self.lr_scheduler_gamma)
         return [optimizer], [scheduler]

-    @staticmethod
-    def add_model_specific_args(parent_parser):
-        parser = parent_parser.add_argument_group("TransferLearningModel")
-        parser.add_argument(
-            "--backbone",
-            default="resnet50",
-            type=str,
-            metavar="BK",
-            help="Name (as in ``torchvision.models``) of the feature extractor",
-        )
-        parser.add_argument(
-            "--epochs", default=15, type=int, metavar="N", help="total number of epochs", dest="nb_epochs"
-        )
-        parser.add_argument("--batch-size", default=8, type=int, metavar="B", help="batch size", dest="batch_size")
-        parser.add_argument("--gpus", type=int, default=0, help="number of gpus to use")
-        parser.add_argument(
-            "--lr", "--learning-rate", default=1e-3, type=float, metavar="LR", help="initial learning rate", dest="lr"
-        )
-        parser.add_argument(
-            "--lr-scheduler-gamma",
-            default=1e-1,
-            type=float,
-            metavar="LRG",
-            help="Factor by which the learning rate is reduced at each milestone",
-            dest="lr_scheduler_gamma",
-        )
-        parser.add_argument(
-            "--train-bn",
-            default=False,
-            type=bool,
-            metavar="TB",
-            help="Whether the BatchNorm layers should be trainable",
-            dest="train_bn",
-        )
-        parser.add_argument(
-            "--milestones", default=[2, 4], type=list, metavar="M", help="List of two epochs milestones"
-        )
-        return parent_parser
-
-
-def main(args: argparse.Namespace) -> None:
-    """Train the model.
-
-    Args:
-        args: Model hyper-parameters
-
-    Note:
-        For the sake of the example, the images dataset will be downloaded
-        to a temporary directory.
-    """
-
-    datamodule = CatDogImageDataModule(
-        dl_path=os.path.join(args.root_data_path, 'data'), batch_size=args.batch_size, num_workers=args.num_workers
-    )
-    model = TransferLearningModel(**vars(args))
-    finetuning_callback = MilestonesFinetuning(milestones=args.milestones)
-
-    trainer = pl.Trainer(
-        weights_summary=None,
-        progress_bar_refresh_rate=1,
-        num_sanity_val_steps=0,
-        gpus=args.gpus,
-        max_epochs=args.nb_epochs,
-        callbacks=[finetuning_callback]
-    )
-
-    trainer.fit(model, datamodule=datamodule)
-
-
-def get_args() -> argparse.Namespace:
-    parent_parser = argparse.ArgumentParser(add_help=False)
-    parent_parser.add_argument(
-        "--root-data-path",
-        metavar="DIR",
-        type=str,
-        default=Path.cwd().as_posix(),
-        help="Root directory where to download the data",
-        dest="root_data_path",
-    )
-    parser = TransferLearningModel.add_model_specific_args(parent_parser)
-    parser = CatDogImageDataModule.add_argparse_args(parser)
-    return parser.parse_args()
+class MyLightningCLI(LightningCLI):
+
+    def add_arguments_to_parser(self, parser):
+        parser.add_class_arguments(MilestonesFinetuning, 'finetuning')
+        parser.link_arguments('data.batch_size', 'model.batch_size')
+        parser.link_arguments('finetuning.milestones', 'model.milestones')
+        parser.link_arguments('finetuning.train_bn', 'model.train_bn')
+        parser.set_defaults({
+            'trainer.max_epochs': 15,
+            'trainer.weights_summary': None,
+            'trainer.progress_bar_refresh_rate': 1,
+            'trainer.num_sanity_val_steps': 0,
+        })
+
+    def instantiate_trainer(self):
+        finetuning_callback = MilestonesFinetuning(**self.config_init['finetuning'])
+        self.trainer_defaults['callbacks'] = [finetuning_callback]
+        super().instantiate_trainer()
+
+
+def cli_main():
+    MyLightningCLI(TransferLearningModel, CatDogImageDataModule, seed_everything_default=1234)


 if __name__ == "__main__":
     cli_lightning_logo()
-    main(get_args())
+    cli_main()
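
[Editor's note] A quick way to exercise the refactored entry point could look like the following — a hedged sketch, not part of the commit. `LightningCLI` parses `sys.argv`, so it is substituted for the call; the dotted flag names follow the jsonargparse convention shown in the docs diff above, and `trainer.fast_dev_run` is a standard Trainer flag:

    # Hypothetical smoke test for the new CLI entry point.
    import sys
    from unittest import mock

    args = [
        'computer_vision_fine_tuning.py',
        '--data.batch_size=16',            # propagated to model.batch_size via link_arguments
        '--finetuning.milestones=[1, 2]',  # propagated to model.milestones
        '--trainer.fast_dev_run=true',     # single-batch run for a quick check
    ]
    with mock.patch.object(sys, 'argv', args):
        cli_main()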
4 changes: 2 additions & 2 deletions pytorch_lightning/accelerators/accelerator.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import contextlib
 from collections import defaultdict
-from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Union
+from typing import Any, Callable, DefaultDict, Dict, Generator, Iterable, List, Optional, Union

 import torch
 from torch import Tensor
@@ -114,7 +114,7 @@ def pre_dispatch(self, trainer: 'pl.Trainer') -> None:
     def _move_optimizer_state(self) -> None:
         """ Moves the state of the optimizers to the GPU if needed. """
         for opt in self.optimizers:
-            state = defaultdict(dict)
+            state: DefaultDict = defaultdict(dict)
             for p, v in opt.state.items():
                 state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, self.root_device)
             opt.state = state
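
[Editor's note] For context, a self-contained sketch of what `_move_optimizer_state` does with `apply_to_collection` — assuming a momentum-SGD optimizer outside any Trainer; illustration only, not code from this commit:

    from collections import defaultdict
    from typing import DefaultDict

    import torch
    from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device

    param = torch.nn.Parameter(torch.zeros(2))
    opt = torch.optim.SGD([param], lr=0.1, momentum=0.9)
    param.grad = torch.ones(2)
    opt.step()  # populates opt.state with a momentum-buffer tensor

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    state: DefaultDict = defaultdict(dict)  # the annotation this commit adds for mypy
    for p, v in opt.state.items():
        # move every tensor inside the (possibly nested) per-param state to `device`
        state[p] = apply_to_collection(v, torch.Tensor, move_data_to_device, device)
    opt.state = state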
(The remaining 13 changed files in this commit are not rendered in this view.)