From ce8abd62553a449630f992696a2a8eb02ca7f724 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 26 Oct 2020 11:47:09 +0100 Subject: [PATCH 1/6] Drone: use nightly build cuda docker images (#3658) * upgrade PT version * update docker * docker * try 1.5 * badge * fix typo: dor -> for (#3918) * prune * prune * env * echo * try * notes * env * env * env * notes * docker * prune * maintainer * CI * update * just 1.5 * CI * CI * CI * CI * CI * CI * CI * CI * CI * CI * CI * docker * CI * CI * CI * CI * CI * CI * CI * CI * CI * push * try * prune * CI * CI * CI * CI Co-authored-by: Klyukin Valeriy Co-authored-by: Jeff Yang --- .drone.yml | 34 +---- .github/workflows/ci_dockers.yml | 84 ++++++++---- .github/workflows/ci_test-conda.yml | 8 +- .github/workflows/docker-builds.yml | 4 +- .github/workflows/nightly.yml | 50 ++++---- README.md | 4 +- dockers/README.md | 25 +++- dockers/base-conda/Dockerfile | 121 ++++++++++++++++++ dockers/base-cuda/Dockerfile | 109 ++++++++-------- dockers/base-xla/Dockerfile | 5 +- dockers/{conda => release}/Dockerfile | 6 +- dockers/tpu-tests/Dockerfile | 2 + .../cluster_environment.py | 1 + .../cluster_environments/slurm_environment.py | 1 + .../torchelastic_environment.py | 3 +- requirements/devel.txt | 2 +- 16 files changed, 304 insertions(+), 155 deletions(-) create mode 100644 dockers/base-conda/Dockerfile rename dockers/{conda => release}/Dockerfile (94%) diff --git a/.drone.yml b/.drone.yml index 5c759f042e6616..bb4d8a74b28f53 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,44 +20,21 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 environment: - SLURM_LOCALID: 0 CODECOV_TOKEN: from_secret: codecov_token MKL_THREADING_LAYER: GNU - HOROVOD_GPU_OPERATIONS: NCCL - HOROVOD_WITH_PYTORCH: 1 - HOROVOD_WITHOUT_TENSORFLOW: 1 - HOROVOD_WITHOUT_MXNET: 1 - HOROVOD_WITH_GLOO: 1 - HOROVOD_WITHOUT_MPI: 1 - - 
#volumes: - # # Mount pip cache from host - # - name: pip_cache - # path: /opt/conda/lib/python3.7/site-packages commands: - # todo: remove unsets as in correct image Horovod shall be set - - unset HOROVOD_GPU_ALLREDUCE - - unset HOROVOD_GPU_BROADCAST - - export PATH="$PATH:/root/.local/bin" - python --version - - pip install pip -U - pip --version - nvidia-smi - #- bash ./requirements/install_AMP.sh - - apt-get update && apt-get install -y cmake - - pip uninstall -y horovod # todo: this shall not be needed - - pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed --no-cache-dir - #- pip install -r ./requirements/docs.txt --user -q - - pip install -r ./requirements/examples.txt --user -q --upgrade-strategy only-if-needed + - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir - pip list - - python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')" - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --color=yes --durations=25 # --flake8 - - python -m py.test benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 + - python -m pytest benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 #- cd docs; make doctest; make coverage - coverage report # see: https://docs.codecov.io/docs/merging-reports @@ -73,8 +50,3 @@ trigger: include: - push - pull_request - -#volumes: -# - name: pip_cache -# host: -# path: /tmp/cache/drone/pip diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 017d4e637bacda..c8816486f26884 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -9,7 +9,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master] jobs: - build-Conda: + build-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -21,18 
+21,16 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: Build Conda Docker + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 + - name: Build PL Docker # publish master uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/conda/Dockerfile + file: dockers/release/Dockerfile push: false timeout-minutes: 50 @@ -48,10 +46,8 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Build XLA Docker # publish master uses: docker/build-push-action@v2 @@ -70,24 +66,25 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.7 - # pytorch_version: 1.8 # todo - # pytorch_channel: pytorch-nightly - - python_version: 3.8 + #- python_version: 3.8 + # pytorch_version: 1.7 # todo + - python_version: 3.7 pytorch_version: 1.6 - pytorch_channel: pytorch - python_version: 3.6 - pytorch_version: 1.5 - pytorch_channel: pytorch + pytorch_version: 1.3 steps: - name: Checkout uses: actions/checkout@v2 - # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + id: extend + # https://github.com/docker/setup-buildx-action + # Set up 
Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker # publish master uses: docker/build-push-action@v2 @@ -95,8 +92,49 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 + + build-conda: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + - python_version: 3.8 + pytorch_version: 1.6 + - python_version: 3.6 + pytorch_version: 1.4 + #- python_version: 3.7 + # pytorch_version: 1.8 # todo + steps: + - name: Checkout + uses: actions/checkout@v2 + + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + channel=$(python -c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1) + echo "::set-output name=CHANNEL::$channel" + id: extend + + # https://github.com/docker/setup-buildx-action + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 + - name: Build CUDA Docker + # publish master + uses: docker/build-push-action@v2 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + file: dockers/base-conda/Dockerfile + push: false + timeout-minutes: 50 diff --git 
a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 3289f5cbecf5ef..f652cbb1a4b58c 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -9,14 +9,14 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: conda: - runs-on: ${{ matrix.os }} - container: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} + runs-on: ubuntu-20.04 + container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} strategy: fail-fast: false matrix: - os: [ubuntu-20.04] + # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] + pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 1dc2b7c4e04953..0ba6f701f65d6c 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -8,7 +8,7 @@ on: types: [created] jobs: - build-Conda: + build-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -36,7 +36,7 @@ jobs: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - dockerfile: dockers/conda/Dockerfile + dockerfile: dockers/release/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 55 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index c91033b65f62c6..eb10c439360446 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -8,6 
+8,7 @@ on: # based on https://github.com/pypa/gh-action-pypi-publish jobs: + pypi-release: runs-on: ubuntu-20.04 @@ -47,10 +48,8 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: @@ -78,37 +77,32 @@ jobs: matrix: python_version: [3.6, 3.7, 3.8] pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 - pytorch_channel: ["pytorch", "pytorch-nightly"] - # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations exclude: - - pytorch_version: 1.7 - pytorch_channel: pytorch - - pytorch_version: 1.3 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.4 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.5 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.6 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.3 - pytorch_channel: pytorch - python_version: 3.8 + # excludes PT 1.3 as it is missing on pypi + - python_version: 3.8 + pytorch_version: 1.3 + steps: - name: Checkout uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + channel=$(python 
-c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1) + echo "::set-output name=CHANNEL::$channel" + id: extend + - name: Publish CUDA to Docker Hub # publish master uses: docker/build-push-action@v2 @@ -116,7 +110,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} cache-to: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile @@ -131,7 +125,11 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/conda/Dockerfile + PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + file: dockers/base-conda/Dockerfile push: true - tags: pytorchlightning/pytorch_lightning:nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 55 diff --git a/README.md b/README.md index 54552367deb05a..21f4aaab19ad11 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,8 @@ Lightning can automatically export to ONNX or TorchScript for those cases. | System / PyTorch ver. | 1.3 (min. 
req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - |[![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & 
Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | +| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | diff --git a/dockers/README.md b/dockers/README.md index 7b3063e00f79c4..73c40635eb0a57 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,4 +1,6 @@ -## Builds +# Docker images + +## Builds images form attached Dockerfiles You can build it on your own, note it takes lots of time, be prepared. @@ -31,4 +33,23 @@ and if you do not need it anymore, just clean it: ```bash docker image list docker image rm pytorch-lightning:latest -``` \ No newline at end of file +``` + +### Run docker image with GPUs + +To run docker image with access to you GPUs you need to install +```bash +# Add the package repositories +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +sudo systemctl restart docker +``` + +and later run the docker image with `--gpus all` so for example + +``` +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 +``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile new file mode 100644 index 00000000000000..6a7f03970cf754 --- /dev/null +++ b/dockers/base-conda/Dockerfile @@ -0,0 +1,121 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Existing images: +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.8 --build-arg PYTORCH_CHANNEL=pytorch-nightly +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch + +ARG CUDNN_VERSION=8 +ARG CUDA_VERSION=10.2 + +# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 +# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 + +ARG PYTHON_VERSION=3.7 +ARG PYTORCH_VERSION=1.6 +ARG PYTORCH_CHANNEL=pytorch +ARG CONDA_VERSION=4.7.12 + +SHELL ["/bin/bash", "-c"] + +ENV PATH="$PATH:/root/.local/bin" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + curl \ + ca-certificates \ + && \ + +# Install conda and python. +# NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \ + chmod +x ~/miniconda.sh && \ + ~/miniconda.sh -b && \ + rm ~/miniconda.sh && \ + +# Cleaning + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /root/.cache && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH="/root/miniconda3/bin:$PATH" +ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" +ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" + +ENV HOROVOD_GPU_OPERATIONS=NCCL +ENV HOROVOD_WITH_PYTORCH=1 +ENV HOROVOD_WITHOUT_TENSORFLOW=1 +ENV HOROVOD_WITHOUT_MXNET=1 +ENV HOROVOD_WITH_GLOO=1 +ENV HOROVOD_WITHOUT_MPI=1 +#ENV MAKEFLAGS="-j$(nproc)" +ENV MAKEFLAGS="-j1" +ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5" + +ENV CONDA_ENV=lightning +COPY environment.yml environment.yml + +# conda init +RUN conda create -y --name $CONDA_ENV && \ + conda init bash && \ + # NOTE: this requires that the channel is presented in the yaml before packages + # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later + python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ + cat environment.yml && \ + conda env update --file environment.yml && \ + conda clean -ya && \ + rm environment.yml + +ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH +ENV 
LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" +# if you want this environment to be the default one, uncomment the following line: +ENV CONDA_DEFAULT_ENV=${CONDA_ENV} + +COPY ./requirements/extra.txt requirements-extra.txt +COPY ./requirements/test.txt requirements-test.txt + +RUN \ + # Disable cache + pip config set global.cache-dir false && \ + #echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ + #source ~/.bashrc && \ + # Install remaining requirements + pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \ + pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \ + rm requirements* + +RUN \ + # install NVIDIA AMP + git clone https://github.com/NVIDIA/apex && \ + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ + rm -rf apex + +RUN \ + # Show what we have + pip --version && \ + conda info && \ + pip list && \ + python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index ee892bccb289bc..e22b5a862a7d76 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,106 +13,97 @@ # limitations under the License. 
# Existing images: -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1 -ARG CUDNN_VERSION=7 -ARG CUDA_VERSION=10.1 +ARG CUDNN_VERSION=8 +ARG CUDA_VERSION=10.2 # FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -# FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu16.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 # FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 -# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu16.04 ARG PYTHON_VERSION=3.7 ARG PYTORCH_VERSION=1.6 -ARG PYTORCH_CHANNEL=pytorch -ARG CONDA_VERSION=4.7.12 SHELL ["/bin/bash", "-c"] +# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/Prague ENV PATH="$PATH:/root/.local/bin" +ENV 
CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ build-essential \ + pkg-config \ cmake \ git \ - curl \ + wget \ ca-certificates \ + software-properties-common \ + && \ + +# Install python + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-distutils \ + python${PYTHON_VERSION}-dev \ && \ -# Install conda and python. -# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ + + update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ + # Cleaning apt-get autoremove -y && \ apt-get clean && \ rm -rf /root/.cache && \ rm -rf /var/lib/apt/lists/* -ENV PATH="/root/miniconda3/bin:$PATH" -ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" -ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" - ENV HOROVOD_GPU_OPERATIONS=NCCL ENV HOROVOD_WITH_PYTORCH=1 ENV HOROVOD_WITHOUT_TENSORFLOW=1 ENV HOROVOD_WITHOUT_MXNET=1 ENV HOROVOD_WITH_GLOO=1 ENV HOROVOD_WITHOUT_MPI=1 +#ENV MAKEFLAGS="-j$(nproc)" +ENV MAKEFLAGS="-j1" +ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5" -ENV CONDA_ENV=lightning -COPY environment.yml environment.yml +COPY ./requirements.txt requirements.txt +COPY ./requirements/ ./requirements/ # conda init -RUN conda create -y --name $CONDA_ENV "cudatoolkit=$CUDA_VERSION" && \ - conda init bash && \ - # NOTE: this requires that the channel is presented in the yaml before packages - # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later - 
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ - cat environment.yml && \ - conda env update --file environment.yml && \ - conda clean -ya && \ - rm environment.yml - -ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH -ENV LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" -# if you want this environment to be the default one, uncomment the following line: -ENV CONDA_DEFAULT_ENV=${CONDA_ENV} - -COPY ./requirements/extra.txt requirements-extra.txt -COPY ./requirements/test.txt requirements-tests.txt - RUN \ + wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ + python${PYTHON_VERSION} get-pip.py && \ + rm get-pip.py && \ + # Disable cache pip config set global.cache-dir false && \ - #echo ". 
${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ - #source ~/.bashrc && \ + # eventualy use pre-release + #pip install "torch==${PYTORCH_VERSION}.*" --pre && \ + # set particular PyTorch version + python -c "import re ; fname = 'requirements.txt' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch==${PYTORCH_VERSION}.*', open(fname).read()) ; open(fname, 'w').write(req)" && \ + + # Install all requirements + pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \ + rm -rf requirements* + +RUN \ # install NVIDIA AMP git clone https://github.com/NVIDIA/apex && \ - pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ - rm -rf apex && \ - # filter only Horovod - python -c "fname = 'requirements-extra.txt' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' in l])" && \ - # Install all requirements - MAKEFLAGS="-j$(nproc)" ; pip install -r requirements-extra.txt && \ - pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \ - rm requirements* + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ + rm -rf apex RUN \ # Show what we have pip --version && \ - conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index ebad7f8d726194..f44465383a0e0a 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -14,6 +14,8 @@ FROM google/cloud-sdk:slim +MAINTAINER PyTorchLightning + # CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . 
--build-arg PYTHON_VERSION=3.6 # This Dockerfile installs pytorch/xla 3.7 wheels. There are also 3.6 wheels available; see below. ARG PYTHON_VERSION=3.7 @@ -21,6 +23,7 @@ ARG XLA_VERSION=1.6 SHELL ["/bin/bash", "-c"] +ARG CONDA_VERSION=4.7.12 # for skipping configurations ENV DEBIAN_FRONTEND=noninteractive ENV CONDA_ENV=lightning @@ -40,7 +43,7 @@ RUN apt-get update && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/conda/Dockerfile b/dockers/release/Dockerfile similarity index 94% rename from dockers/conda/Dockerfile rename to dockers/release/Dockerfile index 17ad4f9c7e269f..886e794ccdecce 100644 --- a/dockers/conda/Dockerfile +++ b/dockers/release/Dockerfile @@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.5 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +MAINTAINER PyTorchLightning + ARG LIGHTNING_VERSION="" COPY ./ ./pytorch-lightning/ @@ -37,8 +39,6 @@ RUN \ RUN python --version && \ pip --version && \ pip list && \ - conda info && \ - conda list && \ python -c "import pytorch_lightning as pl; print(pl.__version__)" -CMD ["/bin/bash"] +# CMD ["/bin/bash"] diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index d0f7321d5fa034..4d5afa6f461d35 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.6 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +MAINTAINER PyTorchLightning + #SHELL ["/bin/bash", "-c"] COPY ./ ./pytorch-lightning/ diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py 
b/pytorch_lightning/cluster_environments/cluster_environment.py index 316048d6b66f0f..ff3436e66204ca 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + class ClusterEnvironment: def __init__(self): diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 48d1e85476f1c0..44cdc2207899cf 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import re from pytorch_lightning import _logger as log diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index decdd0fd849cc5..d50a10a782dbba 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import os from pytorch_lightning import _logger as log from pytorch_lightning.utilities import rank_zero_warn @@ -44,4 +45,4 @@ def master_port(self): return port def world_size(self): - return os.environ.get('WORLD_SIZE', None) + return os.environ.get('WORLD_SIZE') diff --git a/requirements/devel.txt b/requirements/devel.txt index 5d0262ec172ec3..a8c5293c8c7db6 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -4,7 +4,7 @@ # install all extra dependencies for full package testing -r ./extra.txt -# extended list of dependencies dor development and run lint and tests +# extended list of dependencies for development and run lint and tests -r ./test.txt # install all extra dependencies for running examples From 376268f01e2e4c2b5a4569ab1e1357b38b03e418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 26 Oct 2020 12:22:09 +0100 Subject: [PATCH 2/6] Implement finalize for WandbLogger (#4341) * wandb finish * experiment * upload at end of run * changelog * comment Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- CHANGELOG.md | 2 ++ pytorch_lightning/loggers/wandb.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a55438a433783..cc7ec0401d93c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) +- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) + ## [1.0.3] - 2020-10-20 ### Added diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 4fe308ce28c44c..ca2b04d86aea8d 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -156,3 +156,8 @@ def name(self) -> Optional[str]: def version(self) -> Optional[str]: # don't create an experiment if we don't have one return self._experiment.id if self._experiment else self._id + + def finalize(self, status: str) -> None: + # upload all checkpoints from saving dir + if self._log_model: + wandb.save(os.path.join(self.save_dir, "*.ckpt")) From f07ee33db679a4b4bdcb4a2a221aa5cbb05d7b34 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 26 Oct 2020 11:57:03 +0000 Subject: [PATCH 3/6] BUG - Wandb: Sanitize callable. 
(#4320) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add _sanitize_callable_params * add call on _val if callable * clean code formatter * resolve pep8 * default return function name * resolve pep8 * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Update CHANGELOG.md Co-authored-by: Sean Naren Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli --- .gitignore | 1 + CHANGELOG.md | 16 ++++++++++++++++ pytorch_lightning/loggers/base.py | 25 +++++++++++++++++++++++++ pytorch_lightning/loggers/wandb.py | 1 + tests/loggers/test_wandb.py | 29 +++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+) diff --git a/.gitignore b/.gitignore index db35ac44c62072..fff549a7187945 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,4 @@ mlruns/ *.ckpt pytorch\ lightning test-reports/ +wandb diff --git a/CHANGELOG.md b/CHANGELOG.md index cc7ec0401d93c8..9a534c6bfaf406 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,28 +11,44 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) + - Added plugins docs and DDPPlugin to customize ddp across all accelerators([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) + - Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) + - Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) + ### Changed + - Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) + - Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) + + - Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) + +- Fixed sanitized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) + + ### Deprecated + - Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) + - Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) + ### Removed + ### Fixed - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 8f728300278060..cf0b22d7d446f8 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -168,6 +168,31 @@ def _convert_params(params: Union[Dict[str, Any], Namespace]) -> Dict[str, Any]: return params + @staticmethod + def _sanitize_callable_params(params: Dict[str, Any]) -> Dict[str, Any]: + """ + Sanitize callable params dict, e.g.
``{'a': <function_**** at 0x****>} -> {'a': 'function_****'}``. + + Args: + params: Dictionary containing the hyperparameters + + Returns: + dictionary with all callables sanitized + """ + def _sanitize_callable(val): + # Give them one chance to return a value. Don't go rabbit hole of recursive call + if isinstance(val, Callable): + try: + _val = val() + if isinstance(_val, Callable): + return val.__name__ + return _val + except Exception: + return val.__name__ + return val + + return {key: _sanitize_callable(val) for key, val in params.items()} + @staticmethod def _flatten_dict(params: Dict[str, Any], delimiter: str = '/') -> Dict[str, Any]: """ diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index ca2b04d86aea8d..e6ce264d597bf3 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -135,6 +135,7 @@ def watch(self, model: nn.Module, log: str = 'gradients', log_freq: int = 100): def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: params = self._convert_params(params) params = self._flatten_dict(params) + params = self._sanitize_callable_params(params) self.experiment.config.update(params, allow_val_change=True) @rank_zero_only diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index e87d1dff126d95..6682cfdc8830ad 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -14,6 +14,8 @@ import os import pickle from unittest import mock +from argparse import ArgumentParser +import types from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger @@ -109,3 +111,30 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir): assert trainer.checkpoint_callback.dirpath == str(tmpdir / 'project' / version / 'checkpoints') assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + + +def test_wandb_sanitize_callable_params(tmpdir): + """ + Callback functions are not serializable.
Therefore, we get them a chance to return + something and if the returned type is not accepted, return None. + """ + opt = "--max_epochs 1".split(" ") + parser = ArgumentParser() + parser = Trainer.add_argparse_args(parent_parser=parser) + params = parser.parse_args(opt) + + def return_something(): + return "something" + params.something = return_something + + def wrapper_something(): + return return_something + params.wrapper_something = wrapper_something + + assert isinstance(params.gpus, types.FunctionType) + params = WandbLogger._convert_params(params) + params = WandbLogger._flatten_dict(params) + params = WandbLogger._sanitize_callable_params(params) + assert params["gpus"] == '_gpus_arg_default' + assert params["something"] == "something" + assert params["wrapper_something"] == "wrapper_something" From 7166171962985f905bad41f532582fdb3c9d050d Mon Sep 17 00:00:00 2001 From: ananthsub Date: Mon, 26 Oct 2020 05:34:35 -0700 Subject: [PATCH 4/6] Update ddp_plugin.py (#4363) Co-authored-by: Sean Naren --- pytorch_lightning/plugins/ddp_plugin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/ddp_plugin.py index 2cb9da1981e7be..27deeeddfdb450 100644 --- a/pytorch_lightning/plugins/ddp_plugin.py +++ b/pytorch_lightning/plugins/ddp_plugin.py @@ -20,7 +20,7 @@ def configure_ddp(self, model, device_ids): """ - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningModule: + def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: """ Override to define a custom DDP implementation. 
@@ -40,6 +40,7 @@ def configure_ddp(self, model, device_ids): device_ids: the list of devices available Returns: + the model wrapped in LightningDistributedDataParallel """ model = LightningDistributedDataParallel(model, device_ids=device_ids, find_unused_parameters=True) From 6abc254ae6830bcf2688f780cd12ef68e57531ce Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Mon, 26 Oct 2020 07:54:38 -0700 Subject: [PATCH 5/6] [Doc] Fix on_train_batch_end description (#4330) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * more doc fixes Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli Co-authored-by: chaton --- pytorch_lightning/core/hooks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index c1851809916487..7500d1a11d440c 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -132,7 +132,7 @@ def on_train_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloade Called in the training loop after the batch. Args: - outputs: The outputs of validation_step_end(validation_step(x)) + outputs: The outputs of training_step_end(training_step(x)) batch: The batched data as it is returned by the training DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader @@ -156,7 +156,7 @@ def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: Called in the validation loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. 
batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -168,7 +168,7 @@ def on_validation_batch_end(self, outputs: Any, batch: Any, batch_idx: int, data Args: outputs: The outputs of validation_step_end(validation_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -179,7 +179,7 @@ def on_test_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) - Called in the test loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -191,7 +191,7 @@ def on_test_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloader Args: outputs: The outputs of test_step_end(test_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. 
batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ From 8e3faa2da1e7c64a54072fa5115a242e71c49c4b Mon Sep 17 00:00:00 2001 From: Chenglu Date: Tue, 27 Oct 2020 02:08:58 +0800 Subject: [PATCH 6/6] get help from docstring (#4344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add getting help message from docstring * Fix pep8 issue * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Apply suggestions from code review Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- CHANGELOG.md | 3 ++ pytorch_lightning/utilities/argparse_utils.py | 29 +++++++++-- tests/utilities/test_argparse_utils.py | 50 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 tests/utilities/test_argparse_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a534c6bfaf406..08e2e93b93d9a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args`.
([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) + + ### Changed diff --git a/pytorch_lightning/utilities/argparse_utils.py b/pytorch_lightning/utilities/argparse_utils.py index 57c9e23d80dc92..f3cf2e5f1b90d1 100644 --- a/pytorch_lightning/utilities/argparse_utils.py +++ b/pytorch_lightning/utilities/argparse_utils.py @@ -14,7 +14,7 @@ import inspect import os from argparse import ArgumentParser, Namespace -from typing import Union, List, Tuple, Any +from typing import Dict, Union, List, Tuple, Any from pytorch_lightning.utilities import parsing @@ -160,7 +160,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: allowed_types = (str, int, float, bool) - # TODO: get "help" from docstring :) + args_help = parse_args_from_docstring(cls.__init__.__doc__ or cls.__doc__) for arg, arg_types, arg_default in ( at for at in get_init_arguments_and_types(cls) if at[0] not in depr_arg_names ): @@ -200,13 +200,36 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: dest=arg, default=arg_default, type=use_type, - help='autogenerated by pl.Trainer', + help=args_help.get(arg), **arg_kwargs, ) return parser +def parse_args_from_docstring(docstring: str) -> Dict[str, str]: + arg_block_indent = None + current_arg = None + parsed = {} + for line in docstring.split("\n"): + stripped = line.lstrip() + if not stripped: + continue + line_indent = len(line) - len(stripped) + if stripped.startswith(('Args:', 'Arguments:', 'Parameters:')): + arg_block_indent = line_indent + 4 + elif arg_block_indent is None: + continue + elif line_indent < arg_block_indent: + break + elif line_indent == arg_block_indent: + current_arg, arg_description = stripped.split(':', maxsplit=1) + parsed[current_arg] = arg_description.lstrip() + elif line_indent > arg_block_indent: + parsed[current_arg] += f' {stripped}' + return parsed + + def _gpus_allowed_type(x) -> Union[int, str]: if ',' in x: return str(x) diff --git 
a/tests/utilities/test_argparse_utils.py b/tests/utilities/test_argparse_utils.py new file mode 100644 index 00000000000000..978ad820482b2f --- /dev/null +++ b/tests/utilities/test_argparse_utils.py @@ -0,0 +1,50 @@ +from pytorch_lightning.utilities.argparse_utils import parse_args_from_docstring + + +def test_parse_args_from_docstring_normal(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + root: Root directory of dataset where ``MNIST/processed/training.pt`` + and ``MNIST/processed/test.pt`` exist. + train: If ``True``, creates dataset from ``training.pt``, + otherwise from ``test.pt``. + normalize: mean and std deviation of the MNIST dataset. + download: If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + num_samples: number of examples per selected class/digit + digits: list selected MNIST digits/classes + + Examples: + >>> dataset = TrialMNIST(download=True) + >>> len(dataset) + 300 + >>> sorted(set([d.item() for d in dataset.targets])) + [0, 1, 2] + >>> torch.bincount(dataset.targets) + tensor([100, 100, 100]) + """ + ) + + expected_args = ['root', 'train', 'normalize', 'download', 'num_samples', 'digits'] + assert len(args_help.keys()) == len(expected_args) + assert all([x == y for x, y in zip(args_help.keys(), expected_args)]) + assert args_help['root'] == 'Root directory of dataset where ``MNIST/processed/training.pt``' \ + ' and ``MNIST/processed/test.pt`` exist.' + assert args_help['normalize'] == 'mean and std deviation of the MNIST dataset.' + + +def test_parse_args_from_docstring_empty(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + + Returns: + + Examples: + """ + ) + assert len(args_help.keys()) == 0