Commit e3b26a1

Merge branch 'master' into no-return-val

carmocca committed Apr 28, 2021
2 parents a0a6a69 + 0c6c078

Showing 39 changed files with 811 additions and 122 deletions.
21 changes: 14 additions & 7 deletions .github/workflows/ci_test-full.yml
@@ -18,11 +18,17 @@ jobs:
fail-fast: false
matrix:
  os: [ubuntu-18.04, windows-2019, macOS-10.15]
  python-version: [3.6, 3.7, 3.8, 3.9]
  python-version: [3.6, 3.8, 3.9]
  requires: ['minimal', 'latest']
  release: ['stable']
  exclude:
    - python-version: 3.9
      requires: 'minimal'
  include:
    - os: ubuntu-20.04
      python-version: 3.9
      requires: 'latest'
      release: 'pre'

# Timeout: https://stackoverflow.com/a/59076067/4521646
# TODO: the macOS is taking too long, probably caching did not work...
@@ -96,9 +102,9 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements/extra.txt') }}
key: ${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}-${{ hashFiles('requirements.txt') }}-${{ hashFiles('requirements/extra.txt') }}
restore-keys: |
${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.requires }}-
${{ runner.os }}-pip-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}-
- name: Pull checkpoints from S3
run: |
@@ -126,7 +132,8 @@ jobs:
python --version
pip --version
# python -m pip install --upgrade --user pip
pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade
flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1)
pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade $flag
# adjust versions according to the installed Torch version
python ./requirements/adjust_versions.py requirements/extra.txt
python ./requirements/adjust_versions.py requirements/examples.txt
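
The added lines compute pip's `--pre` flag from the matrix value with a Python one-liner. A standalone sketch of the same pattern, assuming the matrix value is exposed through a hypothetical `RELEASE` environment variable:

```python
# Sketch: print "--pre" only for pre-release CI jobs; prints an empty string otherwise.
import os

release = os.environ.get("RELEASE", "stable")  # stand-in for ${{ matrix.release }}
print("--pre" if release == "pre" else "")
```

The workflow captures this output in `$flag` and appends it to `pip install`, so stable jobs keep resolving released wheels while the `release: 'pre'` job pulls pre-release ones.
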
@@ -158,7 +165,7 @@ jobs:
- name: Tests
run: |
# NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003
coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}.xml
coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --durations=50 --junitxml=junit/test-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml
- name: Examples
run: |
@@ -167,8 +174,8 @@
- name: Upload pytest results
uses: actions/upload-artifact@v2
with:
name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}
path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}.xml
name: pytest-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}
path: junit/test-results-${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml
if: failure()

- name: Statistics
25 changes: 25 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added


- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/PyTorchLightning/pytorch-lightning/pull/6944/))


- Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/PyTorchLightning/pytorch-lightning/pull/7202))


- Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942))


@@ -114,12 +121,21 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868))


- Added `debug` flag to TPU Training Plugins (PT_XLA_DEBUG) ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219))


- Added new `UnrepeatedDistributedSampler` and `IndexBatchSamplerWrapper` for tracking distributed predictions ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215))


- Added `trainer.predict(return_predictions=None|False|True)` ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215))


- Added `BasePredictionWriter` callback to implement prediction saving ([#7127](https://github.com/PyTorchLightning/pytorch-lightning/pull/7127))


- Added `tpu_distributed` check for TPU Spawn barrier ([#7241](https://github.com/PyTorchLightning/pytorch-lightning/pull/7241))


### Changed

- Renamed `pytorch_lightning.callbacks.swa` to `pytorch_lightning.callbacks.stochastic_weight_avg` ([#6259](https://github.com/PyTorchLightning/pytorch-lightning/pull/6259))
@@ -155,6 +171,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed default setting for communication of multi-node training using `DDPShardedPlugin` ([#6937](https://github.com/PyTorchLightning/pytorch-lightning/pull/6937))


- `LightningModule.from_datasets()` now accepts `IterableDataset` instances as training datasets. ([#7503](https://github.com/PyTorchLightning/pytorch-lightning/pull/7503))


### Deprecated

- Deprecated the `save_function` property from the `ModelCheckpoint` callback ([#7201](https://github.com/PyTorchLightning/pytorch-lightning/pull/7201))
@@ -361,6 +380,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed resetting device after `fitting/evaluating/predicting` ([#7188](https://github.com/PyTorchLightning/pytorch-lightning/pull/7188))


- Fixed metrics not being properly logged with `precision=16` and `manual_optimization` ([#7228](https://github.com/PyTorchLightning/pytorch-lightning/pull/7228))


- Fixed `parameters_to_ignore` not properly set to DDPWrapper ([#7239](https://github.com/PyTorchLightning/pytorch-lightning/pull/7239))


## [1.2.7] - 2021-04-06

### Fixed
22 changes: 21 additions & 1 deletion dockers/README.md
@@ -45,7 +45,7 @@ docker image list
docker image rm pytorch-lightning:latest
```

### Run docker image with GPUs
## Run docker image with GPUs

To run the docker image with access to your GPUs, you need to install
```bash
@@ -63,3 +63,23 @@ and later run the docker image with `--gpus all` so for example
```
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6
```

## Run Jupyter server

Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in-a-dockerfile

1. Build the docker image:
```bash
docker image build \
-t pytorch-lightning:v1.2.9 \
-f dockers/nvidia/Dockerfile \
--build-arg LIGHTNING_VERSION=1.2.9 \
.
```
2. Start the server and map the ports:
```bash
docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.2.9
```
3. Connect from a local browser:
   - copy the generated URL, e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6`
   - replace `hostname` with `localhost`
26 changes: 18 additions & 8 deletions dockers/nvidia/Dockerfile
@@ -13,18 +13,18 @@
# limitations under the License.

# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_21-03.html#rel_21-03
FROM nvcr.io/nvidia/pytorch:20.12-py3
FROM nvcr.io/nvidia/pytorch:21.03-py3

MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>

ARG LIGHTNING_VERSION=""

RUN python -c "import torch ; print(torch.__version__)" >> torch_version.info

COPY ./ /workspace/pytorch-lightning/

RUN \
cd /workspace && \
mv pytorch-lightning/notebooks . && \
mv pytorch-lightning/pl_examples . && \
# replace by specific version if asked
if [ ! -z "$LIGHTNING_VERSION" ] ; then \
rm -rf pytorch-lightning ; \
@@ -33,18 +33,28 @@ RUN \
mv pytorch-lightning-*/ pytorch-lightning ; \
rm *.zip ; \
fi && \
# save the examples
mv pytorch-lightning/notebooks . && \
mv pytorch-lightning/pl_examples . && \

# Installations
python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \
pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \
pip install ./pytorch-lightning --no-cache-dir && \
pip install "Pillow>=8.1" "torchtext>=0.9.0" ipython[all] --no-cache-dir --upgrade-strategy only-if-needed && \
rm -rf pytorch-lightning
pip install "Pillow>=8.1" --no-cache-dir --upgrade-strategy only-if-needed && \
rm -rf pytorch-lightning && \
pip list

ENV PYTHONPATH="/workspace"

RUN python --version && \
RUN \
TORCH_VERSION=$(cat torch_version.info) && \
rm torch_version.info && \
python --version && \
pip --version && \
pip list && \
pip list | grep torch && \
python -c "from torch import __version__ as ver ; assert ver == '$TORCH_VERSION', ver" && \
python -c "import pytorch_lightning as pl; print(pl.__version__)"

# CMD ["/bin/bash"]
CMD ["jupyter", "notebook", "--port=8888", "--no-browser", "--ip=0.0.0.0", "--allow-root"]
5 changes: 4 additions & 1 deletion docs/source/advanced/multi_gpu.rst
@@ -282,7 +282,10 @@ Data Parallel
That is, if you have a batch of 32 and use DP with 2 gpus, each GPU will process 16 samples,
after which the root node will aggregate the results.

.. warning:: DP use is discouraged by PyTorch and Lightning. Use DDP which is more stable and at least 3x faster
.. warning:: DP use is discouraged by PyTorch and Lightning. State is not maintained on the replicas created by the
:class:`~torch.nn.DataParallel` wrapper and you may see errors or misbehavior if you assign state to the module
in the ``forward()`` or ``*_step()`` methods. For the same reason we cannot fully support
:ref:`manual_optimization` with DP. Use DDP which is more stable and at least 3x faster.
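
To make the pitfall concrete, here is a minimal sketch (the module and attribute names are illustrative, not from the
docs) showing that attributes assigned inside ``forward()`` on a :class:`~torch.nn.DataParallel` replica never reach
the original module:

.. code-block:: python

    import torch
    from torch.nn import DataParallel, Linear, Module


    class Toy(Module):
        def __init__(self):
            super().__init__()
            self.layer = Linear(4, 4)
            self.last_batch_size = None  # state we (wrongly) try to track

        def forward(self, x):
            self.last_batch_size = x.size(0)  # written to a throwaway replica
            return self.layer(x)


    model = Toy().cuda()
    if torch.cuda.device_count() >= 2:
        wrapped = DataParallel(model, device_ids=[0, 1])
        wrapped(torch.randn(32, 4).cuda())
        print(model.last_batch_size)  # still None: the replicas were discarded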

.. testcode::
:skipif: torch.cuda.device_count() < 2
72 changes: 68 additions & 4 deletions docs/source/common/lightning_cli.rst
@@ -33,6 +33,9 @@
MyModelBaseClass = MyModel
MyDataModuleBaseClass = MyDataModule

EncoderBaseClass = MyModel
DecoderBaseClass = MyModel

mock_argv = mock.patch("sys.argv", ["any.py"])
mock_argv.start()

@@ -116,7 +119,7 @@ The start of a possible implementation of :class:`MyModel` including the recommended
docstring could be the one below. Note that by using type hints and docstrings there is no need to duplicate this
information to define its configurable arguments.

.. code-block:: python
.. testcode::

class MyModel(LightningModule):

@@ -131,7 +134,8 @@ information to define its configurable arguments.
encoder_layers: Number of layers for the encoder
decoder_layers: Number of layers for each decoder block
"""
...
super().__init__()
self.save_hyperparameters()

With this model class, the help of the trainer tool would look as follows:

@@ -258,7 +262,67 @@ A possible config file could be as follows:
...
Only model classes that are a subclass of :code:`MyModelBaseClass` would be allowed, and similarly only subclasses of
:code:`MyDataModuleBaseClass`.
:code:`MyDataModuleBaseClass`. If :class:`~pytorch_lightning.core.lightning.LightningModule` and
:class:`~pytorch_lightning.core.datamodule.LightningDataModule` are given as the base classes, then the tool allows any
lightning module and data module.
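
For instance, a CLI accepting any lightning module and data module via :code:`class_path` could be instantiated as in
this sketch (the :code:`subclass_mode_*` flags are the ones described above):

.. code-block:: python

    from pytorch_lightning import LightningDataModule, LightningModule
    from pytorch_lightning.utilities.cli import LightningCLI

    cli = LightningCLI(
        LightningModule,
        LightningDataModule,
        subclass_mode_model=True,  # --model is given as a class_path (+ init_args)
        subclass_mode_data=True,   # likewise for --data
    )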

.. tip::

    Note that with the subclass modes the :code:`--help` option does not show information for a specific subclass. To
    get help for a subclass, the options :code:`--model.help` and :code:`--data.help` can be used, followed by the
    desired class path. Similarly, :code:`--print_config` does not include the settings for a particular subclass. To
    include them, the class path should be given before the :code:`--print_config` option. Examples for both help and
    print config:

    .. code-block:: bash

        $ python trainer.py --model.help mycode.mymodels.MyModel
        $ python trainer.py --model mycode.mymodels.MyModel --print_config

Models with multiple submodules
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Many use cases require several modules, each with its own configurable options. One possible way to handle this with
LightningCLI is to implement a single module that takes each of the submodules as init parameters. Since the init
parameters are typed as classes, in the configuration they are specified with :code:`class_path` and
:code:`init_args` entries. For instance, a model could be implemented as:

.. testcode::

    class MyMainModel(LightningModule):

        def __init__(
            self,
            encoder: EncoderBaseClass,
            decoder: DecoderBaseClass
        ):
            """Example encoder-decoder submodules model

            Args:
                encoder: Instance of a module for encoding
                decoder: Instance of a module for decoding
            """
            super().__init__()
            self.encoder = encoder
            self.decoder = decoder

If the CLI is implemented as :code:`LightningCLI(MyMainModel)`, the configuration would be as follows:

.. code-block:: yaml

    model:
      encoder:
        class_path: mycode.myencoders.MyEncoder
        init_args:
          ...
      decoder:
        class_path: mycode.mydecoders.MyDecoder
        init_args:
          ...

It is also possible to combine :code:`subclass_mode_model=True` and submodules, thereby having two levels of
:code:`class_path`.


Customizing LightningCLI
@@ -275,7 +339,7 @@ extended to customize different parts of the command line tool. The argument parser
adding arguments can be done using the :func:`add_argument` method. In contrast to argparse, it has additional methods to
add arguments, for example :func:`add_class_arguments`, which adds all arguments from the init of a class, though it requires
the parameters to have type hints. For more details about this please refer to the `respective documentation
<https://omni-us.github.io/jsonargparse/#classes-methods-and-functions>`_.
<https://jsonargparse.readthedocs.io/en/stable/#classes-methods-and-functions>`_.

The :class:`~pytorch_lightning.utilities.cli.LightningCLI` class has the
:meth:`~pytorch_lightning.utilities.cli.LightningCLI.add_arguments_to_parser` method which can be implemented to include
2 changes: 2 additions & 0 deletions docs/source/common/optimizers.rst
@@ -15,6 +15,8 @@ For advanced/expert users who want to do esoteric optimization schedules or techniques

-----

.. _manual_optimization:

Manual optimization
===================
For advanced research topics like reinforcement learning, sparse coding, or GAN research, it may be desirable to
1 change: 1 addition & 0 deletions docs/source/extensions/callbacks.rst
@@ -104,6 +104,7 @@ Lightning has a few built-in callbacks.
LearningRateMonitor
ModelCheckpoint
ModelPruning
BasePredictionWriter
ProgressBar
ProgressBarBase
QuantizationAwareTraining
8 changes: 5 additions & 3 deletions pl_examples/domain_templates/computer_vision_fine_tuning.py
@@ -225,7 +225,7 @@ def forward(self, x):
# 2. Classifier (returns logits):
x = self.fc(x)

return torch.sigmoid(x)
return x

def loss(self, logits, labels):
return self.loss_func(input=logits, target=labels)
@@ -234,27 +234,29 @@ def training_step(self, batch, batch_idx):
# 1. Forward pass:
x, y = batch
y_logits = self.forward(x)
y_scores = torch.sigmoid(y_logits)
y_true = y.view((-1, 1)).type_as(x)

# 2. Compute loss
train_loss = self.loss(y_logits, y_true)

# 3. Compute accuracy:
self.log("train_acc", self.train_acc(y_logits, y_true.int()), prog_bar=True)
self.log("train_acc", self.train_acc(y_scores, y_true.int()), prog_bar=True)

return train_loss

def validation_step(self, batch, batch_idx):
# 1. Forward pass:
x, y = batch
y_logits = self.forward(x)
y_scores = torch.sigmoid(y_logits)
y_true = y.view((-1, 1)).type_as(x)

# 2. Compute loss
self.log("val_loss", self.loss(y_logits, y_true), prog_bar=True)

# 3. Compute accuracy:
self.log("val_acc", self.valid_acc(y_logits, y_true.int()), prog_bar=True)
self.log("val_acc", self.valid_acc(y_scores, y_true.int()), prog_bar=True)

def configure_optimizers(self):
parameters = list(self.parameters())
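
The example change stops applying `torch.sigmoid` inside `forward` and instead feeds raw logits to the loss while
passing probabilities to the accuracy metric. A minimal sketch of the distinction, assuming a binary head trained with
`BCEWithLogitsLoss`:

```python
# Sketch: BCEWithLogitsLoss consumes raw logits (numerically stable, applies
# sigmoid internally); metrics such as accuracy expect probabilities.
import torch
from torch.nn import BCEWithLogitsLoss

logits = torch.randn(8, 1)                     # raw model outputs
targets = torch.randint(0, 2, (8, 1)).float()

loss = BCEWithLogitsLoss()(logits, targets)    # pass logits, not probabilities
scores = torch.sigmoid(logits)                 # probabilities for metrics
print(loss, scores.squeeze())
```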