From ce8abd62553a449630f992696a2a8eb02ca7f724 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 26 Oct 2020 11:47:09 +0100 Subject: [PATCH 1/6] Drone: use nightly build cuda docker images (#3658) * upgrade PT version * update docker * docker * try 1.5 * badge * fix typo: dor -> for (#3918) * prune * prune * env * echo * try * notes * env * env * env * notes * docker * prune * maintainer * CI * update * just 1.5 * CI * CI * CI * CI * CI * CI * CI * CI * CI * CI * CI * docker * CI * CI * CI * CI * CI * CI * CI * CI * CI * push * try * prune * CI * CI * CI * CI Co-authored-by: Klyukin Valeriy Co-authored-by: Jeff Yang --- .drone.yml | 34 +---- .github/workflows/ci_dockers.yml | 84 ++++++++---- .github/workflows/ci_test-conda.yml | 8 +- .github/workflows/docker-builds.yml | 4 +- .github/workflows/nightly.yml | 50 ++++---- README.md | 4 +- dockers/README.md | 25 +++- dockers/base-conda/Dockerfile | 121 ++++++++++++++++++ dockers/base-cuda/Dockerfile | 109 ++++++++-------- dockers/base-xla/Dockerfile | 5 +- dockers/{conda => release}/Dockerfile | 6 +- dockers/tpu-tests/Dockerfile | 2 + .../cluster_environment.py | 1 + .../cluster_environments/slurm_environment.py | 1 + .../torchelastic_environment.py | 3 +- requirements/devel.txt | 2 +- 16 files changed, 304 insertions(+), 155 deletions(-) create mode 100644 dockers/base-conda/Dockerfile rename dockers/{conda => release}/Dockerfile (94%) diff --git a/.drone.yml b/.drone.yml index 5c759f042e6616..bb4d8a74b28f53 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,44 +20,21 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 environment: - SLURM_LOCALID: 0 CODECOV_TOKEN: from_secret: codecov_token MKL_THREADING_LAYER: GNU - HOROVOD_GPU_OPERATIONS: NCCL - HOROVOD_WITH_PYTORCH: 1 - HOROVOD_WITHOUT_TENSORFLOW: 1 - HOROVOD_WITHOUT_MXNET: 1 - HOROVOD_WITH_GLOO: 1 - HOROVOD_WITHOUT_MPI: 1 - - 
#volumes: - # # Mount pip cache from host - # - name: pip_cache - # path: /opt/conda/lib/python3.7/site-packages commands: - # todo: remove unsets as in correct image Horovod shall be set - - unset HOROVOD_GPU_ALLREDUCE - - unset HOROVOD_GPU_BROADCAST - - export PATH="$PATH:/root/.local/bin" - python --version - - pip install pip -U - pip --version - nvidia-smi - #- bash ./requirements/install_AMP.sh - - apt-get update && apt-get install -y cmake - - pip uninstall -y horovod # todo: this shall not be needed - - pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed --no-cache-dir - #- pip install -r ./requirements/docs.txt --user -q - - pip install -r ./requirements/examples.txt --user -q --upgrade-strategy only-if-needed + - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir - pip list - - python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')" - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --color=yes --durations=25 # --flake8 - - python -m py.test benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 + - python -m pytest benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 #- cd docs; make doctest; make coverage - coverage report # see: https://docs.codecov.io/docs/merging-reports @@ -73,8 +50,3 @@ trigger: include: - push - pull_request - -#volumes: -# - name: pip_cache -# host: -# path: /tmp/cache/drone/pip diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 017d4e637bacda..c8816486f26884 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -9,7 +9,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master] jobs: - build-Conda: + build-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -21,18 
+21,16 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: Build Conda Docker + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 + - name: Build PL Docker # publish master uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/conda/Dockerfile + file: dockers/release/Dockerfile push: false timeout-minutes: 50 @@ -48,10 +46,8 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Build XLA Docker # publish master uses: docker/build-push-action@v2 @@ -70,24 +66,25 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.7 - # pytorch_version: 1.8 # todo - # pytorch_channel: pytorch-nightly - - python_version: 3.8 + #- python_version: 3.8 + # pytorch_version: 1.7 # todo + - python_version: 3.7 pytorch_version: 1.6 - pytorch_channel: pytorch - python_version: 3.6 - pytorch_version: 1.5 - pytorch_channel: pytorch + pytorch_version: 1.3 steps: - name: Checkout uses: actions/checkout@v2 - # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + id: extend + # https://github.com/docker/setup-buildx-action + # Set up 
Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker # publish master uses: docker/build-push-action@v2 @@ -95,8 +92,49 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 + + build-conda: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + - python_version: 3.8 + pytorch_version: 1.6 + - python_version: 3.6 + pytorch_version: 1.4 + #- python_version: 3.7 + # pytorch_version: 1.8 # todo + steps: + - name: Checkout + uses: actions/checkout@v2 + + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + channel=$(python -c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1) + echo "::set-output name=CHANNEL::$channel" + id: extend + + # https://github.com/docker/setup-buildx-action + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 + - name: Build CUDA Docker + # publish master + uses: docker/build-push-action@v2 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + file: dockers/base-conda/Dockerfile + push: false + timeout-minutes: 50 diff --git 
a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 3289f5cbecf5ef..f652cbb1a4b58c 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -9,14 +9,14 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: conda: - runs-on: ${{ matrix.os }} - container: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} + runs-on: ubuntu-20.04 + container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} strategy: fail-fast: false matrix: - os: [ubuntu-20.04] + # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] + pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 1dc2b7c4e04953..0ba6f701f65d6c 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -8,7 +8,7 @@ on: types: [created] jobs: - build-Conda: + build-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -36,7 +36,7 @@ jobs: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - dockerfile: dockers/conda/Dockerfile + dockerfile: dockers/release/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 55 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index c91033b65f62c6..eb10c439360446 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -8,6 
+8,7 @@ on: # based on https://github.com/pypa/gh-action-pypi-publish jobs: + pypi-release: runs-on: ubuntu-20.04 @@ -47,10 +48,8 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: @@ -78,37 +77,32 @@ jobs: matrix: python_version: [3.6, 3.7, 3.8] pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 - pytorch_channel: ["pytorch", "pytorch-nightly"] - # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations exclude: - - pytorch_version: 1.7 - pytorch_channel: pytorch - - pytorch_version: 1.3 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.4 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.5 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.6 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.3 - pytorch_channel: pytorch - python_version: 3.8 + # excludes PT 1.3 as it is missing on pypi + - python_version: 3.8 + pytorch_version: 1.3 + steps: - name: Checkout uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + channel=$(python 
-c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1) + echo "::set-output name=CHANNEL::$channel" + id: extend + - name: Publish CUDA to Docker Hub # publish master uses: docker/build-push-action@v2 @@ -116,7 +110,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} cache-to: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile @@ -131,7 +125,11 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/conda/Dockerfile + PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + file: dockers/base-conda/Dockerfile push: true - tags: pytorchlightning/pytorch_lightning:nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 55 diff --git a/README.md b/README.md index 54552367deb05a..21f4aaab19ad11 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,8 @@ Lightning can automatically export to ONNX or TorchScript for those cases. | System / PyTorch ver. | 1.3 (min. 
req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - |[![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & 
Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | +| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | diff --git a/dockers/README.md b/dockers/README.md index 7b3063e00f79c4..73c40635eb0a57 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,4 +1,6 @@ -## Builds +# Docker images + +## Builds images form attached Dockerfiles You can build it on your own, note it takes lots of time, be prepared. @@ -31,4 +33,23 @@ and if you do not need it anymore, just clean it: ```bash docker image list docker image rm pytorch-lightning:latest -``` \ No newline at end of file +``` + +### Run docker image with GPUs + +To run docker image with access to you GPUs you need to install +```bash +# Add the package repositories +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +sudo systemctl restart docker +``` + +and later run the docker image with `--gpus all` so for example + +``` +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 +``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile new file mode 100644 index 00000000000000..6a7f03970cf754 --- /dev/null +++ b/dockers/base-conda/Dockerfile @@ -0,0 +1,121 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Existing images: +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.8 --build-arg PYTORCH_CHANNEL=pytorch-nightly +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch + +ARG CUDNN_VERSION=8 +ARG CUDA_VERSION=10.2 + +# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 +# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 + +ARG PYTHON_VERSION=3.7 +ARG PYTORCH_VERSION=1.6 +ARG PYTORCH_CHANNEL=pytorch +ARG CONDA_VERSION=4.7.12 + +SHELL ["/bin/bash", "-c"] + +ENV PATH="$PATH:/root/.local/bin" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + curl \ + ca-certificates \ + && \ + +# Install conda and python. +# NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \ + chmod +x ~/miniconda.sh && \ + ~/miniconda.sh -b && \ + rm ~/miniconda.sh && \ + +# Cleaning + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /root/.cache && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH="/root/miniconda3/bin:$PATH" +ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" +ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" + +ENV HOROVOD_GPU_OPERATIONS=NCCL +ENV HOROVOD_WITH_PYTORCH=1 +ENV HOROVOD_WITHOUT_TENSORFLOW=1 +ENV HOROVOD_WITHOUT_MXNET=1 +ENV HOROVOD_WITH_GLOO=1 +ENV HOROVOD_WITHOUT_MPI=1 +#ENV MAKEFLAGS="-j$(nproc)" +ENV MAKEFLAGS="-j1" +ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5" + +ENV CONDA_ENV=lightning +COPY environment.yml environment.yml + +# conda init +RUN conda create -y --name $CONDA_ENV && \ + conda init bash && \ + # NOTE: this requires that the channel is presented in the yaml before packages + # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later + python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ + cat environment.yml && \ + conda env update --file environment.yml && \ + conda clean -ya && \ + rm environment.yml + +ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH +ENV 
LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" +# if you want this environment to be the default one, uncomment the following line: +ENV CONDA_DEFAULT_ENV=${CONDA_ENV} + +COPY ./requirements/extra.txt requirements-extra.txt +COPY ./requirements/test.txt requirements-test.txt + +RUN \ + # Disable cache + pip config set global.cache-dir false && \ + #echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ + #source ~/.bashrc && \ + # Install remaining requirements + pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \ + pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \ + rm requirements* + +RUN \ + # install NVIDIA AMP + git clone https://github.com/NVIDIA/apex && \ + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ + rm -rf apex + +RUN \ + # Show what we have + pip --version && \ + conda info && \ + pip list && \ + python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index ee892bccb289bc..e22b5a862a7d76 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,106 +13,97 @@ # limitations under the License. 
# Existing images: -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1 -ARG CUDNN_VERSION=7 -ARG CUDA_VERSION=10.1 +ARG CUDNN_VERSION=8 +ARG CUDA_VERSION=10.2 # FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -# FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu16.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 # FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 -# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu16.04 ARG PYTHON_VERSION=3.7 ARG PYTORCH_VERSION=1.6 -ARG PYTORCH_CHANNEL=pytorch -ARG CONDA_VERSION=4.7.12 SHELL ["/bin/bash", "-c"] +# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/Prague ENV PATH="$PATH:/root/.local/bin" +ENV 
CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ build-essential \ + pkg-config \ cmake \ git \ - curl \ + wget \ ca-certificates \ + software-properties-common \ + && \ + +# Install python + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-distutils \ + python${PYTHON_VERSION}-dev \ && \ -# Install conda and python. -# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ + + update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ + # Cleaning apt-get autoremove -y && \ apt-get clean && \ rm -rf /root/.cache && \ rm -rf /var/lib/apt/lists/* -ENV PATH="/root/miniconda3/bin:$PATH" -ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" -ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" - ENV HOROVOD_GPU_OPERATIONS=NCCL ENV HOROVOD_WITH_PYTORCH=1 ENV HOROVOD_WITHOUT_TENSORFLOW=1 ENV HOROVOD_WITHOUT_MXNET=1 ENV HOROVOD_WITH_GLOO=1 ENV HOROVOD_WITHOUT_MPI=1 +#ENV MAKEFLAGS="-j$(nproc)" +ENV MAKEFLAGS="-j1" +ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5" -ENV CONDA_ENV=lightning -COPY environment.yml environment.yml +COPY ./requirements.txt requirements.txt +COPY ./requirements/ ./requirements/ # conda init -RUN conda create -y --name $CONDA_ENV "cudatoolkit=$CUDA_VERSION" && \ - conda init bash && \ - # NOTE: this requires that the channel is presented in the yaml before packages - # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later - 
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ - cat environment.yml && \ - conda env update --file environment.yml && \ - conda clean -ya && \ - rm environment.yml - -ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH -ENV LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" -# if you want this environment to be the default one, uncomment the following line: -ENV CONDA_DEFAULT_ENV=${CONDA_ENV} - -COPY ./requirements/extra.txt requirements-extra.txt -COPY ./requirements/test.txt requirements-tests.txt - RUN \ + wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ + python${PYTHON_VERSION} get-pip.py && \ + rm get-pip.py && \ + # Disable cache pip config set global.cache-dir false && \ - #echo ". 
${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ - #source ~/.bashrc && \ + # eventualy use pre-release + #pip install "torch==${PYTORCH_VERSION}.*" --pre && \ + # set particular PyTorch version + python -c "import re ; fname = 'requirements.txt' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch==${PYTORCH_VERSION}.*', open(fname).read()) ; open(fname, 'w').write(req)" && \ + + # Install all requirements + pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \ + rm -rf requirements* + +RUN \ # install NVIDIA AMP git clone https://github.com/NVIDIA/apex && \ - pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ - rm -rf apex && \ - # filter only Horovod - python -c "fname = 'requirements-extra.txt' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' in l])" && \ - # Install all requirements - MAKEFLAGS="-j$(nproc)" ; pip install -r requirements-extra.txt && \ - pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \ - rm requirements* + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ + rm -rf apex RUN \ # Show what we have pip --version && \ - conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index ebad7f8d726194..f44465383a0e0a 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -14,6 +14,8 @@ FROM google/cloud-sdk:slim +MAINTAINER PyTorchLightning + # CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . 
--build-arg PYTHON_VERSION=3.6 # This Dockerfile installs pytorch/xla 3.7 wheels. There are also 3.6 wheels available; see below. ARG PYTHON_VERSION=3.7 @@ -21,6 +23,7 @@ ARG XLA_VERSION=1.6 SHELL ["/bin/bash", "-c"] +ARG CONDA_VERSION=4.7.12 # for skipping configurations ENV DEBIAN_FRONTEND=noninteractive ENV CONDA_ENV=lightning @@ -40,7 +43,7 @@ RUN apt-get update && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/conda/Dockerfile b/dockers/release/Dockerfile similarity index 94% rename from dockers/conda/Dockerfile rename to dockers/release/Dockerfile index 17ad4f9c7e269f..886e794ccdecce 100644 --- a/dockers/conda/Dockerfile +++ b/dockers/release/Dockerfile @@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.5 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +MAINTAINER PyTorchLightning + ARG LIGHTNING_VERSION="" COPY ./ ./pytorch-lightning/ @@ -37,8 +39,6 @@ RUN \ RUN python --version && \ pip --version && \ pip list && \ - conda info && \ - conda list && \ python -c "import pytorch_lightning as pl; print(pl.__version__)" -CMD ["/bin/bash"] +# CMD ["/bin/bash"] diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index d0f7321d5fa034..4d5afa6f461d35 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.6 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +MAINTAINER PyTorchLightning + #SHELL ["/bin/bash", "-c"] COPY ./ ./pytorch-lightning/ diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py 
b/pytorch_lightning/cluster_environments/cluster_environment.py index 316048d6b66f0f..ff3436e66204ca 100644 --- a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + class ClusterEnvironment: def __init__(self): diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 48d1e85476f1c0..44cdc2207899cf 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import re from pytorch_lightning import _logger as log diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index decdd0fd849cc5..d50a10a782dbba 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import os from pytorch_lightning import _logger as log from pytorch_lightning.utilities import rank_zero_warn @@ -44,4 +45,4 @@ def master_port(self): return port def world_size(self): - return os.environ.get('WORLD_SIZE', None) + return os.environ.get('WORLD_SIZE') diff --git a/requirements/devel.txt b/requirements/devel.txt index 5d0262ec172ec3..a8c5293c8c7db6 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -4,7 +4,7 @@ # install all extra dependencies for full package testing -r ./extra.txt -# extended list of dependencies dor development and run lint and tests +# extended list of dependencies for development and run lint and tests -r ./test.txt # install all extra dependencies for running examples From 376268f01e2e4c2b5a4569ab1e1357b38b03e418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 26 Oct 2020 12:22:09 +0100 Subject: [PATCH 2/6] Implement finalize for WandbLogger (#4341) * wandb finish * experiment * upload at end of run * changelog * comment Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- CHANGELOG.md | 2 ++ pytorch_lightning/loggers/wandb.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a55438a433783..cc7ec0401d93c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) +- Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) + ## [1.0.3] - 2020-10-20 ### Added diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index 4fe308ce28c44c..ca2b04d86aea8d 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -156,3 +156,8 @@ def name(self) -> Optional[str]: def version(self) -> Optional[str]: # don't create an experiment if we don't have one return self._experiment.id if self._experiment else self._id + + def finalize(self, status: str) -> None: + # upload all checkpoints from saving dir + if self._log_model: + wandb.save(os.path.join(self.save_dir, "*.ckpt")) From f07ee33db679a4b4bdcb4a2a221aa5cbb05d7b34 Mon Sep 17 00:00:00 2001 From: chaton Date: Mon, 26 Oct 2020 11:57:03 +0000 Subject: [PATCH 3/6] BUG - Wandb: Sanitize callable. 
(#4320) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add _sanitize_callable_params * add call on _val if callable * clean code formatter * resolve pep8 * default return function name * resolve pep8 * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Update CHANGELOG.md Co-authored-by: Sean Naren Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli --- .gitignore | 1 + CHANGELOG.md | 16 ++++++++++++++++ pytorch_lightning/loggers/base.py | 25 +++++++++++++++++++++++++ pytorch_lightning/loggers/wandb.py | 1 + tests/loggers/test_wandb.py | 29 +++++++++++++++++++++++++++++ 5 files changed, 72 insertions(+) diff --git a/.gitignore b/.gitignore index db35ac44c62072..fff549a7187945 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,4 @@ mlruns/ *.ckpt pytorch\ lightning test-reports/ +wandb diff --git a/CHANGELOG.md b/CHANGELOG.md index cc7ec0401d93c8..9a534c6bfaf406 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,28 +11,44 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) + - Added plugins docs and DDPPlugin to customize ddp across all accelerators([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) + - Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) + - Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) + ### Changed + - Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) + - Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) + + - Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) + +- Fixed sanitized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) + + ### Deprecated + - Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) + - Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) + ### Removed + ### Fixed - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) diff --git a/pytorch_lightning/loggers/base.py b/pytorch_lightning/loggers/base.py index 8f728300278060..cf0b22d7d446f8 100644 --- a/pytorch_lightning/loggers/base.py +++ b/pytorch_lightning/loggers/base.py @@ -168,6 +168,31 @@ def _convert_params(params: Union[Dict[str, Any], Namespace]) -> Dict[str, Any]: return params + @staticmethod + def _sanitize_callable_params(params: Dict[str, Any]) -> Dict[str, Any]: + """ + Sanitize callable params dict, e.g.
``{'a': <function_**** at 0x****>} -> {'a': 'function_****'}``. + + Args: + params: Dictionary containing the hyperparameters + + Returns: + dictionary with all callables sanitized + """ + def _sanitize_callable(val): + # Give them one chance to return a value. Don't go rabbit hole of recursive call + if isinstance(val, Callable): + try: + _val = val() + if isinstance(_val, Callable): + return val.__name__ + return _val + except Exception: + return val.__name__ + return val + + return {key: _sanitize_callable(val) for key, val in params.items()} + @staticmethod def _flatten_dict(params: Dict[str, Any], delimiter: str = '/') -> Dict[str, Any]: """ diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py index ca2b04d86aea8d..e6ce264d597bf3 100644 --- a/pytorch_lightning/loggers/wandb.py +++ b/pytorch_lightning/loggers/wandb.py @@ -135,6 +135,7 @@ def watch(self, model: nn.Module, log: str = 'gradients', log_freq: int = 100): def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: params = self._convert_params(params) params = self._flatten_dict(params) + params = self._sanitize_callable_params(params) self.experiment.config.update(params, allow_val_change=True) @rank_zero_only diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py index e87d1dff126d95..6682cfdc8830ad 100644 --- a/tests/loggers/test_wandb.py +++ b/tests/loggers/test_wandb.py @@ -14,6 +14,8 @@ import os import pickle from unittest import mock +from argparse import ArgumentParser +import types from pytorch_lightning import Trainer from pytorch_lightning.loggers import WandbLogger @@ -109,3 +111,30 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir): assert trainer.checkpoint_callback.dirpath == str(tmpdir / 'project' / version / 'checkpoints') assert set(os.listdir(trainer.checkpoint_callback.dirpath)) == {'epoch=0.ckpt'} + + +def test_wandb_sanitize_callable_params(tmpdir): + """ + Callback functions are not serializable.
Therefore, we get them a chance to return + something and if the returned type is not accepted, return None. + """ + opt = "--max_epochs 1".split(" ") + parser = ArgumentParser() + parser = Trainer.add_argparse_args(parent_parser=parser) + params = parser.parse_args(opt) + + def return_something(): + return "something" + params.something = return_something + + def wrapper_something(): + return return_something + params.wrapper_something = wrapper_something + + assert isinstance(params.gpus, types.FunctionType) + params = WandbLogger._convert_params(params) + params = WandbLogger._flatten_dict(params) + params = WandbLogger._sanitize_callable_params(params) + assert params["gpus"] == '_gpus_arg_default' + assert params["something"] == "something" + assert params["wrapper_something"] == "wrapper_something" From 7166171962985f905bad41f532582fdb3c9d050d Mon Sep 17 00:00:00 2001 From: ananthsub Date: Mon, 26 Oct 2020 05:34:35 -0700 Subject: [PATCH 4/6] Update ddp_plugin.py (#4363) Co-authored-by: Sean Naren --- pytorch_lightning/plugins/ddp_plugin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/ddp_plugin.py b/pytorch_lightning/plugins/ddp_plugin.py index 2cb9da1981e7be..27deeeddfdb450 100644 --- a/pytorch_lightning/plugins/ddp_plugin.py +++ b/pytorch_lightning/plugins/ddp_plugin.py @@ -20,7 +20,7 @@ def configure_ddp(self, model, device_ids): """ - def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningModule: + def configure_ddp(self, model: LightningModule, device_ids: List[int]) -> LightningDistributedDataParallel: """ Override to define a custom DDP implementation. 
@@ -40,6 +40,7 @@ def configure_ddp(self, model, device_ids): device_ids: the list of devices available Returns: + the model wrapped in LightningDistributedDataParallel """ model = LightningDistributedDataParallel(model, device_ids=device_ids, find_unused_parameters=True) From 6abc254ae6830bcf2688f780cd12ef68e57531ce Mon Sep 17 00:00:00 2001 From: Amog Kamsetty Date: Mon, 26 Oct 2020 07:54:38 -0700 Subject: [PATCH 5/6] [Doc] Fix on_train_batch_end description (#4330) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * more doc fixes Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli Co-authored-by: chaton --- pytorch_lightning/core/hooks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/core/hooks.py b/pytorch_lightning/core/hooks.py index c1851809916487..7500d1a11d440c 100644 --- a/pytorch_lightning/core/hooks.py +++ b/pytorch_lightning/core/hooks.py @@ -132,7 +132,7 @@ def on_train_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloade Called in the training loop after the batch. Args: - outputs: The outputs of validation_step_end(validation_step(x)) + outputs: The outputs of training_step_end(training_step(x)) batch: The batched data as it is returned by the training DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader @@ -156,7 +156,7 @@ def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: Called in the validation loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. 
batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -168,7 +168,7 @@ def on_validation_batch_end(self, outputs: Any, batch: Any, batch_idx: int, data Args: outputs: The outputs of validation_step_end(validation_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the validation DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -179,7 +179,7 @@ def on_test_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) - Called in the test loop before anything happens for that batch. Args: - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ @@ -191,7 +191,7 @@ def on_test_batch_end(self, outputs: Any, batch: Any, batch_idx: int, dataloader Args: outputs: The outputs of test_step_end(test_step(x)) - batch: The batched data as it is returned by the training DataLoader. + batch: The batched data as it is returned by the test DataLoader. 
batch_idx: the index of the batch dataloader_idx: the index of the dataloader """ From 8e3faa2da1e7c64a54072fa5115a242e71c49c4b Mon Sep 17 00:00:00 2001 From: Chenglu Date: Tue, 27 Oct 2020 02:08:58 +0800 Subject: [PATCH 6/6] get help from docstring (#4344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add getting help message from docstring * Fix pep8 issue * Apply suggestions from code review Co-authored-by: Adrian Wälchli * Apply suggestions from code review Co-authored-by: Jirka Borovec Co-authored-by: Adrian Wälchli Co-authored-by: Jirka Borovec Co-authored-by: Sean Naren --- CHANGELOG.md | 3 ++ pytorch_lightning/utilities/argparse_utils.py | 29 +++++++++-- tests/utilities/test_argparse_utils.py | 50 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 tests/utilities/test_argparse_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9a534c6bfaf406..08e2e93b93d9a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args`.
([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) + + ### Changed diff --git a/pytorch_lightning/utilities/argparse_utils.py b/pytorch_lightning/utilities/argparse_utils.py index 57c9e23d80dc92..f3cf2e5f1b90d1 100644 --- a/pytorch_lightning/utilities/argparse_utils.py +++ b/pytorch_lightning/utilities/argparse_utils.py @@ -14,7 +14,7 @@ import inspect import os from argparse import ArgumentParser, Namespace -from typing import Union, List, Tuple, Any +from typing import Dict, Union, List, Tuple, Any from pytorch_lightning.utilities import parsing @@ -160,7 +160,7 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: allowed_types = (str, int, float, bool) - # TODO: get "help" from docstring :) + args_help = parse_args_from_docstring(cls.__init__.__doc__ or cls.__doc__) for arg, arg_types, arg_default in ( at for at in get_init_arguments_and_types(cls) if at[0] not in depr_arg_names ): @@ -200,13 +200,36 @@ def add_argparse_args(cls, parent_parser: ArgumentParser) -> ArgumentParser: dest=arg, default=arg_default, type=use_type, - help='autogenerated by pl.Trainer', + help=args_help.get(arg), **arg_kwargs, ) return parser +def parse_args_from_docstring(docstring: str) -> Dict[str, str]: + arg_block_indent = None + current_arg = None + parsed = {} + for line in docstring.split("\n"): + stripped = line.lstrip() + if not stripped: + continue + line_indent = len(line) - len(stripped) + if stripped.startswith(('Args:', 'Arguments:', 'Parameters:')): + arg_block_indent = line_indent + 4 + elif arg_block_indent is None: + continue + elif line_indent < arg_block_indent: + break + elif line_indent == arg_block_indent: + current_arg, arg_description = stripped.split(':', maxsplit=1) + parsed[current_arg] = arg_description.lstrip() + elif line_indent > arg_block_indent: + parsed[current_arg] += f' {stripped}' + return parsed + + def _gpus_allowed_type(x) -> Union[int, str]: if ',' in x: return str(x) diff --git 
a/tests/utilities/test_argparse_utils.py b/tests/utilities/test_argparse_utils.py new file mode 100644 index 00000000000000..978ad820482b2f --- /dev/null +++ b/tests/utilities/test_argparse_utils.py @@ -0,0 +1,50 @@ +from pytorch_lightning.utilities.argparse_utils import parse_args_from_docstring + + +def test_parse_args_from_docstring_normal(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + root: Root directory of dataset where ``MNIST/processed/training.pt`` + and ``MNIST/processed/test.pt`` exist. + train: If ``True``, creates dataset from ``training.pt``, + otherwise from ``test.pt``. + normalize: mean and std deviation of the MNIST dataset. + download: If true, downloads the dataset from the internet and + puts it in root directory. If dataset is already downloaded, it is not + downloaded again. + num_samples: number of examples per selected class/digit + digits: list selected MNIST digits/classes + + Examples: + >>> dataset = TrialMNIST(download=True) + >>> len(dataset) + 300 + >>> sorted(set([d.item() for d in dataset.targets])) + [0, 1, 2] + >>> torch.bincount(dataset.targets) + tensor([100, 100, 100]) + """ + ) + + expected_args = ['root', 'train', 'normalize', 'download', 'num_samples', 'digits'] + assert len(args_help.keys()) == len(expected_args) + assert all([x == y for x, y in zip(args_help.keys(), expected_args)]) + assert args_help['root'] == 'Root directory of dataset where ``MNIST/processed/training.pt``' \ + ' and ``MNIST/processed/test.pt`` exist.' + assert args_help['normalize'] == 'mean and std deviation of the MNIST dataset.' + + +def test_parse_args_from_docstring_empty(): + args_help = parse_args_from_docstring( + """Constrain image dataset + + Args: + + Returns: + + Examples: + """ + ) + assert len(args_help.keys()) == 0