Merge branch 'master' into fix_dp_logging_aggregation
Borda authored Dec 4, 2020
2 parents 19f4fa7 + ed5bda3 commit bfe5395
Showing 21 changed files with 319 additions and 167 deletions.
45 changes: 35 additions & 10 deletions .github/CODEOWNERS
@@ -5,23 +5,48 @@
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
* @williamfalcon @borda @teddykoker @awaelchli @nateraw @justusschock @tchaton @SeanNaren @ananyahjha93
* @williamfalcon @borda @tchaton @SeanNaren @awaelchli @justusschock

# Metrics
/pytorch_lightning/metrics/* @teddykoker @ananyahjha93 @justusschock
/tests/metrics/* @teddykoker @ananyahjha93 @justusschock
/pytorch_lightning/metrics/ @teddykoker @ananyahjha93 @justusschock
/tests/metrics/ @teddykoker @ananyahjha93 @justusschock
/docs/source/metrics.rst @teddykoker @ananyahjha93 @justusschock

# API
/pytorch_lightning/callbacks/base.py @williamfalcon
/pytorch_lightning/core/datamodule.py @williamfalcon
/pytorch_lightning/trainer/trainer.py @williamfalcon
/pytorch_lightning/core/hooks.py @williamfalcon
/pytorch_lightning/core/lightning.py @williamfalcon
/pytorch_lightning/callbacks/base.py @williamfalcon
/pytorch_lightning/core/datamodule.py @williamfalcon
/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton
/pytorch_lightning/core/hooks.py @williamfalcon
/pytorch_lightning/core/lightning.py @williamfalcon @tchaton
/pytorch_lightning/core/optimizer.py @tchaton
/pytorch_lightning/trainer/training_loop.py @tchaton @SeanNaren
/pytorch_lightning/trainer/evaluation_loop.py @tchaton @SeanNaren

# Connectors
/pytorch_lightning/trainer/connectors/ @tchaton @SeanNaren

# accelerators
/pytorch_lightning/accelerators/* @williamfalcon
/pytorch_lightning/accelerators/ @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock

# owners
/pytorch_lightning/.github/CODEOWNERS @williamfalcon
/.github/CODEOWNERS @williamfalcon
# main
/README.md @williamfalcon @edenlightning
# installation
/setup.py @borda @williamfalcon

# CI/CD
/.github/workflows/ @borda @tchaton
/.github/*.py @borda @tchaton
/dockers/ @borda @tchaton
# configs in root
/*.yml @borda @tchaton

# Docs
/docs/ @edenlightning @tchaton @borda @awaelchli
/.github/*.md @edenlightning @williamfalcon @borda
/.github/ISSUE_TEMPLATE/*.md @edenlightning @borda @tchaton
/docs/source/conf.py @borda @awaelchli

# Testing
/tests/base/boring_model.py @williamfalcon
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [1.1.0rc1] - 2020-12-02
## [1.1.0rc] - 2020-12-02

### Added

@@ -89,6 +89,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Deprecated `prefix` argument in `ModelCheckpoint` ([#4765](https://github.com/PyTorchLightning/pytorch-lightning/pull/4765))


- Deprecated the old way of assigning hyper-parameters through `self.hparams = ...` ([#4813](https://github.com/PyTorchLightning/pytorch-lightning/pull/4813))


- Deprecated `mode='auto'` from `ModelCheckpoint` and `EarlyStopping` ([#4695](https://github.com/PyTorchLightning/pytorch-lightning/pull/4695))
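  As an illustrative sketch (not part of the changelog entry), callbacks would now receive an explicit `mode` instead of relying on `'auto'`:

  .. code-block:: python

      from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

      # pass `mode` explicitly; inferring it from the monitored quantity's name is deprecated
      early_stopping = EarlyStopping(monitor='val_loss', mode='min')
      checkpoint_callback = ModelCheckpoint(monitor='val_acc', mode='max')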


### Removed


4 changes: 2 additions & 2 deletions docs/source/hyperparameters.rst
@@ -167,8 +167,8 @@ improve readability and reproducibility.
def train_dataloader(self):
return DataLoader(mnist_train, batch_size=self.hparams.batch_size)
.. warning:: Deprecated. This method of assigning hyperparameters to the LightningModule is no longer
recommended and will not be supported in future versions of Lightning.
.. warning:: Deprecated since v1.1.0. This method of assigning hyperparameters to the LightningModule
will no longer be supported from v1.3.0. Use the ``self.save_hyperparameters()`` method from above instead.
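A minimal sketch of the recommended replacement, matching the module shown above (the class name ``LitMNIST`` and the learning-rate argument are illustrative, not taken from this diff):

.. code-block:: python

    from torch.utils.data import DataLoader
    from pytorch_lightning import LightningModule

    class LitMNIST(LightningModule):
        def __init__(self, batch_size: int = 32, lr: float = 1e-3):
            super().__init__()
            # stores the init arguments under self.hparams, replacing `self.hparams = ...`
            self.save_hyperparameters()

        def train_dataloader(self):
            # `mnist_train` is the dataset referenced earlier on this page
            return DataLoader(mnist_train, batch_size=self.hparams.batch_size)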


4. You can also save full objects such as `dict` or `Namespace` to the checkpoint.
25 changes: 22 additions & 3 deletions docs/source/weights_loading.rst
@@ -14,6 +14,7 @@ Lightning automates saving and loading checkpoints. Checkpoints capture the exac

Checkpointing your training allows you to resume a training process in case it was interrupted, fine-tune a model or use a pre-trained model for inference without having to retrain the model.


*****************
Checkpoint saving
*****************
@@ -68,7 +69,7 @@ You can customize the checkpointing behavior to monitor any quantity of your tra
# 4. Add your callback to the callbacks list
trainer = Trainer(callbacks=[checkpoint_callback])
You can also control more advanced options, like `save_top_k`, to save the best k models and the mode of the monitored quantity (min/max/auto, where the mode is automatically inferred from the name of the monitored quantity), `save_weights_only` or `period` to set the interval of epochs between checkpoints, to avoid slowdowns.
You can also control more advanced options, like `save_top_k`, to save the best k models and the `mode` of the monitored quantity (min/max), `save_weights_only` or `period` to set the interval of epochs between checkpoints, to avoid slowdowns.

.. code-block:: python
@@ -84,10 +85,11 @@ You can also control more advanced options, like `save_top_k`, to save the best
# saves a file like: my/path/sample-mnist-epoch=02-val_loss=0.32.ckpt
checkpoint_callback = ModelCheckpoint(
monitor='val_loss',
dirpath='my/path/,
dirpath='my/path/',
filename='sample-mnist-{epoch:02d}-{val_loss:.2f}',
save_top_k=3,
mode='min')
mode='min',
)
trainer = Trainer(callbacks=[checkpoint_callback])
@@ -137,6 +139,23 @@ You can manually save checkpoints and restore your model from the checkpointed s
trainer.save_checkpoint("example.ckpt")
new_model = MyModel.load_from_checkpoint(checkpoint_path="example.ckpt")
Manual saving with accelerators
===============================

Lightning also handles accelerators where multiple processes are running, such as DDP. For example, when using the DDP accelerator our training script is running across multiple devices at the same time.
Lightning automatically ensures that the model is saved only on the main process, whilst other processes do not interfere with saving checkpoints. This requires no code changes as seen below.

.. code-block:: python
trainer = Trainer(accelerator="ddp")
model = MyLightningModule(hparams)
trainer.fit(model)
# Saves only on the main process
trainer.save_checkpoint("example.ckpt")
Not using `trainer.save_checkpoint` can lead to unexpected behaviour and potential deadlock. Using other saving functions will result in all devices attempting to save the checkpoint. As a result, we highly recommend using the trainer's save functionality.
If using custom saving functions cannot be avoided, we recommend using :func:`~pytorch_lightning.loggers.base.rank_zero_only` to ensure saving occurs only on the main process.
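As a hedged illustration (not part of this diff), a custom saving helper could be guarded with ``rank_zero_only``; the import matches the ``pytorch_lightning.utilities`` export seen elsewhere in this commit, and the helper name is made up:

.. code-block:: python

    import torch
    from pytorch_lightning.utilities import rank_zero_only

    @rank_zero_only
    def save_state_dict(model, path):
        # executes on the global rank-zero process only; other ranks skip the body
        torch.save(model.state_dict(), path)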

******************
Checkpoint loading
******************
58 changes: 19 additions & 39 deletions pl_examples/domain_templates/imagenet.py
@@ -19,7 +19,6 @@
"""
import os
from argparse import ArgumentParser, Namespace
from collections import OrderedDict

import torch
import torch.nn.functional as F
@@ -37,24 +36,23 @@


class ImageNetLightningModel(LightningModule):

# pull out resnet names from torchvision models
MODEL_NAMES = sorted(
name for name in models.__dict__
if name.islower() and not name.startswith("__") and callable(models.__dict__[name])
)

def __init__(
self,
arch: str,
pretrained: bool,
lr: float,
momentum: float,
weight_decay: int,
data_path: str,
batch_size: int,
workers: int,
**kwargs,
self,
arch: str,
pretrained: bool,
lr: float,
momentum: float,
weight_decay: int,
data_path: str,
batch_size: int,
workers: int,
**kwargs,
):
super().__init__()
self.save_hyperparameters()
Expand All @@ -74,39 +72,21 @@ def forward(self, x):
def training_step(self, batch, batch_idx):
images, target = batch
output = self(images)
loss_val = F.cross_entropy(output, target)
loss_train = F.cross_entropy(output, target)
acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))

tqdm_dict = {'train_loss': loss_val}
output = OrderedDict({
'loss': loss_val,
'acc1': acc1,
'acc5': acc5,
'progress_bar': tqdm_dict,
'log': tqdm_dict
})
return output
self.log('train_loss', loss_train, on_step=True, on_epoch=True, logger=True)
self.log('train_acc1', acc1, on_step=True, prog_bar=True, on_epoch=True, logger=True)
self.log('train_acc5', acc5, on_step=True, on_epoch=True, logger=True)
return loss_train

def validation_step(self, batch, batch_idx):
images, target = batch
output = self(images)
loss_val = F.cross_entropy(output, target)
acc1, acc5 = self.__accuracy(output, target, topk=(1, 5))

output = OrderedDict({
'val_loss': loss_val,
'val_acc1': acc1,
'val_acc5': acc5,
})
return output

def validation_epoch_end(self, outputs):
tqdm_dict = {}
for metric_name in ["val_loss", "val_acc1", "val_acc5"]:
tqdm_dict[metric_name] = torch.stack([output[metric_name] for output in outputs]).mean()

result = {'progress_bar': tqdm_dict, 'log': tqdm_dict, 'val_loss': tqdm_dict["val_loss"]}
return result
self.log('val_loss', loss_val, on_step=True, on_epoch=True)
self.log('val_acc1', acc1, on_step=True, prog_bar=True, on_epoch=True)
self.log('val_acc5', acc5, on_step=True, on_epoch=True)

@staticmethod
def __accuracy(output, target, topk=(1,)):
@@ -121,7 +101,7 @@ def __accuracy(output, target, topk=(1,)):

res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res

17 changes: 10 additions & 7 deletions pl_examples/domain_templates/reinforce_learn_Qnet.py
@@ -1,19 +1,22 @@
"""
Deep Reinforcement Learning: Deep Q-network (DQN)
This example is based on https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-
Second-Edition/blob/master/Chapter06/02_dqn_pong.py
The template illustrates using Lightning for Reinforcement Learning. The example builds a basic DQN using the
classic CartPole environment.
To run the template, just run:
python reinforce_learn_Qnet.py
`python reinforce_learn_Qnet.py`
After ~1500 steps, you will see the total_reward hitting the max score of 200.
Open up TensorBoard to see the metrics:
After ~1500 steps, you will see the total_reward hitting the max score of 200. Open up TensorBoard to
see the metrics:
`tensorboard --logdir default`
tensorboard --logdir default
References
----------
[1] https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-
Second-Edition/blob/master/Chapter06/02_dqn_pong.py
"""

import argparse
9 changes: 3 additions & 6 deletions pytorch_lightning/accelerators/accelerator_connector.py
@@ -15,21 +15,18 @@

import torch

from pytorch_lightning.utilities import HOROVOD_AVAILABLE
from pytorch_lightning import _logger as log
from pytorch_lightning import accelerators
from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment
from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment
from pytorch_lightning.utilities import XLA_AVAILABLE, device_parser, rank_zero_only, TPU_AVAILABLE
from pytorch_lightning.utilities import device_parser, rank_zero_only, TPU_AVAILABLE
from pytorch_lightning.utilities.distributed import rank_zero_info, rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException

try:
if HOROVOD_AVAILABLE:
import horovod.torch as hvd
except (ModuleNotFoundError, ImportError):
HOROVOD_AVAILABLE = False
else:
HOROVOD_AVAILABLE = True


class AcceleratorConnector:
8 changes: 2 additions & 6 deletions pytorch_lightning/accelerators/horovod_accelerator.py
@@ -18,15 +18,11 @@
from torch.optim.lr_scheduler import _LRScheduler

from pytorch_lightning.accelerators.accelerator import Accelerator, ReduceOp
from pytorch_lightning.utilities import AMPType
from pytorch_lightning.utilities import AMPType, HOROVOD_AVAILABLE
from pytorch_lightning.utilities.distributed import rank_zero_only

try:
if HOROVOD_AVAILABLE:
import horovod.torch as hvd
except (ModuleNotFoundError, ImportError):
HOROVOD_AVAILABLE = False
else:
HOROVOD_AVAILABLE = True


class HorovodAccelerator(Accelerator):
