Merge remote-tracking branch 'origin/master' into TPU_device_check

# Conflicts:
#	CHANGELOG.md
#	pytorch_lightning/accelerators/tpu_backend.py
#	pytorch_lightning/trainer/data_loading.py
#	tests/models/test_tpu.py
Lightning-AI · Oct 4, 2020 · 7732fc5 · 7732fc5
2 parents f024a69 + 2c21f7d
commit 7732fc5
Show file tree

Hide file tree

Showing 129 changed files with 2,858 additions and 1,066 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -35,12 +35,8 @@ references:
       name: Build and push Docker image
       command: |
         gcloud --quiet auth configure-docker
-        cd dockers/tpu-tests
-        # TODO: How to find the GITHUB_REF in CircleCI?
-        # $CI_PULL_REQUEST seems to be of form: https://github.com/org/repo-name/pull/11.
-        # Grab the last bit, e.g. pull/11, convert to pull/11/head, and use it
-        # for the GITHUB_REF so Docker can pull the latest pending code in PR.
-        if [ -z "$CI_PULL_REQUEST" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" .; else git_ref=$(echo "$CI_PULL_REQUEST" | sed "s/.*pytorch-lightning\///")/head && docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .; fi
+        #cd dockers/tpu-tests
+        docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=$PYTHON_VER" .
         docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
 
   deploy_cluster: &deploy_cluster
@@ -99,9 +95,13 @@ references:
 jobs:
 
   TPU-tests:
+    parameters:
+      python:
+        type: string
     docker:
       - image: circleci/python:3.7
     environment:
+      - PYTHON_VER: << parameters.python >>
       - MAX_CHECKS: 240
       - CHECK_SPEEP: 5
     steps:
@@ -148,19 +148,14 @@ jobs:
 
 workflows:
   version: 2
-  build:
+  tpu-tests:
     jobs:
       - build-Docs
       - TPU-tests:
-          filters:
-            branches:
-              # https://discuss.circleci.com/t/create-separate-steps-jobs-for-pr-forks-versus-branches/13419/4
-              #only:
-              #  # only from forks
-              #  - /^pull\/.\d+$/
-              ignore:
-                - master
-  cleanup:
+          matrix:
+            parameters:
+              python: ["3.6", "3.7"]
+  tpu-cleanup:
     triggers:
       - schedule:
           # The cron format is:

diff --git a/.drone.yml b/.drone.yml
@@ -46,7 +46,7 @@ steps:
     - pip install pip -U
     - pip --version
     - nvidia-smi
-    #- bash ./tests/install_AMP.sh
+    #- bash ./requirements/install_AMP.sh
     - apt-get update && apt-get install -y cmake
     - pip install -r ./requirements/base.txt --user -q --upgrade-strategy only-if-needed
     - pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed

diff --git a/.github/ISSUE_TEMPLATE/how-to-question.md b/.github/ISSUE_TEMPLATE/how-to-question.md
@@ -9,9 +9,10 @@ assignees: ''
 
 ## ❓ Questions and Help
 
-### Before asking:   
-1. search the issues.   
-2. search the docs.    
+### Before asking: 
+1. Try to find answers to your questions in [the Lightning Forum!](https://forums.pytorchlightning.ai/)
+2. Search for similar [issues](https://github.com/PyTorchLightning/pytorch-lightning/issues).   
+3. Search the [docs](https://pytorch-lightning.readthedocs.io/en/latest/).    
 
 <!-- If you still can't find what you need: -->
 

diff --git a/.github/prepare-nightly_pkg-name.py b/.github/prepare-nightly_pkg-name.py
@@ -0,0 +1,12 @@
+import os
+import re
+
+PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+
+PATH_SETUP = os.path.join(PATH_ROOT, 'setup.py')
+print(f"rename package '{PATH_SETUP}'")
+with open(PATH_SETUP, 'r') as fp:
+    setup = fp.read()
+setup = re.sub(r'name=[\'"]pytorch-lightning[\'"]', 'name="pytorch-lightning-nightly"', setup)
+with open(PATH_SETUP, 'w') as fp:
+    fp.write(setup)
diff --git a/.github/prepare_nightly.py → .github/prepare-nightly_version.py b/.github/prepare_nightly.py → .github/prepare-nightly_version.py
@@ -4,14 +4,6 @@
 
 PATH_ROOT = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
 
-PATH_SETUP = os.path.join(PATH_ROOT, 'setup.py')
-print(f"rename package '{PATH_SETUP}'")
-with open(PATH_SETUP, 'r') as fp:
-    setup = fp.read()
-setup = re.sub(r'name=[\'"]pytorch-lightning[\'"]', 'name="pytorch-lightning-nightly"', setup)
-with open(PATH_SETUP, 'w') as fp:
-    fp.write(setup)
-
 # get today date
 now = datetime.datetime.now()
 now_date = now.strftime("%Y%m%d")

diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
@@ -14,11 +14,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python_version: [3.6, 3.8]
-        pytorch_version: [1.3, 1.5, 1.7]
-        exclude:
-          - python_version: 3.8
-            pytorch_version: 1.3
+        python_version: [3.6]
+        pytorch_version: [1.3, 1.7]
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -62,6 +59,9 @@ jobs:
             pytorch_version: 1.7
             pytorch_channel: pytorch-nightly
           - python_version: 3.8
+            pytorch_version: 1.6
+            pytorch_channel: pytorch
+          - python_version: 3.6
             pytorch_version: 1.5
             pytorch_channel: pytorch
     steps:

diff --git a/.github/workflows/ci_pt-conda.yml → .github/workflows/ci_test-conda.yml b/.github/workflows/ci_pt-conda.yml → .github/workflows/ci_test-conda.yml
diff --git a/.github/workflows/ci_test-tpu.yml b/.github/workflows/ci_test-tpu.yml
@@ -20,10 +20,16 @@ jobs:
   setup-build-publish-deploy:
     name: tpu-testing-job
     runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.6, 3.7]
     # Timeout: https://stackoverflow.com/a/59076067/4521646
     timeout-minutes: 50
 
     steps:
+    - name: Set IMAGETAG
+      run: echo "IMAGETAG=$(date +%s)_${{ matrix.python-version }}" >> $GITHUB_ENV
     - name: Install Go
       uses: actions/setup-go@v2
       with:
@@ -61,9 +67,9 @@ jobs:
       shell: bash
     - name: Build and Push Docker Image
       run: |
-        cd dockers/tpu-tests
-        docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
-        docker push "$IMAGE:$GITHUB_RUN_ID"
+        #cd dockers/tpu-tests
+        docker build --tag "$IMAGE:$IMAGETAG" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=${{ matrix.python-version }}" .
+        docker push "$IMAGE:$IMAGETAG"
       shell: bash
 
     - name: Install jsonnet
@@ -78,7 +84,7 @@ jobs:
 
     - name: Deploy the job on the kubernetes cluster
       run: |-
-        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \
+        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$IMAGETAG | kubectl create -f -) && \
         job_name=${job_name#job.batch/} && \
         job_name=${job_name% created} && \
         echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \
@@ -99,7 +105,7 @@ jobs:
         # First portion is the test logs. Print these to Github Action stdout.
         cat xx00 && \
         echo "Done with log retrieval attempt." && \
-        gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags && \
+        gcloud container images delete "$IMAGE:$IMAGETAG" --force-delete-tags && \
         exit $status_code
       shell: bash
 

diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml
@@ -49,7 +49,9 @@ jobs:
           pip install --requirement requirements/base.txt --upgrade-strategy only-if-needed --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet
           pip install --requirement requirements/extra.txt
           pip install --requirement requirements/docs.txt
-          python --version ; pip --version ; pip list
+          python --version
+          pip --version
+          pip list
         shell: bash
 
       - name: Test Documentation
@@ -87,7 +89,9 @@ jobs:
           pip install --requirement requirements/docs.txt
           # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux
           sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures
-          python --version ; pip --version ; pip list
+          python --version
+          pip --version
+          pip list
         shell: bash
 
       - name: Make Documentation

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -23,7 +23,7 @@ jobs:
         python -m pip install --user --upgrade setuptools wheel
     - name: Build
       run: |
-        python .github/prepare_nightly.py
+        python .github/prepare-nightly_version.py
         python setup.py sdist bdist_wheel
         ls -lh dist/
 
@@ -34,12 +34,7 @@ jobs:
         user: __token__
         password: ${{ secrets.test_pypi_password }}
         repository_url: https://test.pypi.org/legacy/
-
-    - name: Publish distribution 📦 to PyPI
-      uses: pypa/gh-action-pypi-publish@master
-      with:
-        user: __token__
-        password: ${{ secrets.pypi_password }}
+        verbose: true
 
   docker-XLA:
     runs-on: ubuntu-20.04
@@ -52,7 +47,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v2
 
-      - name: Publish Master to Docker
+      - name: Publish XLA to Docker Hub
         # publish master
         uses: docker/build-push-action@v1.1.0
         with:
@@ -88,7 +83,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v2
 
-      - name: Publish Master to Docker
+      - name: Publish CUDA to Docker Hub
         # publish master
         uses: docker/build-push-action@v1.1.0
         with:
@@ -100,32 +95,7 @@ jobs:
           tags: "base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
         timeout-minutes: 55
 
-
-  docker-Conda:
-    runs-on: ubuntu-20.04
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: [ 3.6, 3.7, 3.8 ]
-        pytorch_version: [ 1.3, 1.4, 1.5, 1.6, 1.7 ]
-        pytorch_channel: [ "pytorch", "pytorch-nightly" ]
-        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
-        exclude:
-          - pytorch_version: 1.7
-            pytorch_channel: pytorch
-          - pytorch_version: 1.3
-            pytorch_channel: pytorch-nightly
-          - pytorch_version: 1.4
-            pytorch_channel: pytorch-nightly
-          - pytorch_version: 1.5
-            pytorch_channel: pytorch-nightly
-          - pytorch_version: 1.6
-            pytorch_channel: pytorch-nightly
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-
-      - name: Publish Master to Docker
+      - name: Publish Conda to Docker Hub
         # publish master
         uses: docker/build-push-action@v1.1.0
         with:

diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml
@@ -7,10 +7,10 @@ on:  # Trigger the workflow on push or pull request, but only for the master bra
   release:
     types: [created]
 
-# based on https://github.com/pypa/gh-action-pypi-publish
 
 jobs:
-  build:
+  # based on https://github.com/pypa/gh-action-pypi-publish
+  build-publish:
     runs-on: ubuntu-20.04
 
     steps:
@@ -35,6 +35,7 @@ jobs:
         user: __token__
         password: ${{ secrets.test_pypi_password }}
         repository_url: https://test.pypi.org/legacy/
+        verbose: true
 
     - name: Publish distribution 📦 to PyPI
       if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release'

diff --git a/.mergify.yml b/.mergify.yml
@@ -24,7 +24,7 @@ pull_request_rules:
       # no requested chnages from any reviewer
       - "#changes-requested-reviews-by=0"
       # this serves as ALL check has to pass as we have actually around 40 tests in total
-      - "#status-success>=53"
+      - "#status-success>=54"
       # this is just in case since we rely on GPU tests (note: redundand to the above)
       - status-success=continuous-integration/drone/pr
       - "status-success=ci/circleci: TPU-tests"

diff --git a/.pyrightconfig.json b/.pyrightconfig.json
@@ -35,6 +35,7 @@
     "pytorch_lightning/trainer/connectors/checkpoint_connector.py",
     "pytorch_lightning/trainer/connectors/data_connector.py",
     "pytorch_lightning/trainer/connectors/logger_connector.py",
+    "pytorch_lightning/distributed/dist.py",
     "pytorch_lightning/tuner",
     "pytorch_lightning/plugins"
   ],

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,7 +27,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added support for datamodules to save and load checkpoints when training ([#3563]https://github.com/PyTorchLightning/pytorch-lightning/pull/3563)
 
-- Added `tpu_device_exists` function to check if xla device is a TPU ([#3274](https://github.com/PyTorchLightning/pytorch-lightning/pull/3274))
+- Added support for datamodule in learning rate finder ([#3425](https://github.com/PyTorchLightning/pytorch-lightning/pull/3425))
+
+- Added `broadcast` to `TPUBackend` ([#3814](https://github.com/PyTorchLightning/pytorch-lightning/pull/3814))
 
 - Added `XLADeviceUtils` class to check XLA device type ([#3274](https://github.com/PyTorchLightning/pytorch-lightning/pull/3274))
 
@@ -51,6 +53,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - `row_log_interval` and `log_save_interval` are now based on training loop's `global_step` instead of epoch-internal batch index ([#3667](https://github.com/PyTorchLightning/pytorch-lightning/pull/3667))
 
+- Swap `torch.load` for `fsspec` load in DDP spawn backend ([#3787](https://github.com/PyTorchLightning/pytorch-lightning/pull/3787))
+
+- Swap `torch.load` for `fsspec` load in cloud_io loading ([#3692](https://github.com/PyTorchLightning/pytorch-lightning/pull/3692))
+
 ### Deprecated
 
 
@@ -59,6 +65,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Fixed `on_train_batch_start` hook to end epoch early ([#3700](https://github.com/PyTorchLightning/pytorch-lightning/pull/3700))
+
 - Fixed `num_sanity_val_steps` is clipped to `limit_val_batches` ([#2917](https://github.com/PyTorchLightning/pytorch-lightning/pull/2917))
 
 - Fixed RMSLE metric ([#3188](https://github.com/PyTorchLightning/pytorch-lightning/pull/3188))
@@ -87,9 +95,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed determinism in `DDPSpawnBackend` when using `seed_everything` in main process ([#3335](https://github.com/PyTorchLightning/pytorch-lightning/pull/3335))
 
-- Fixed `ModelCheckpoint` `period` to actually save every `period` epochs ([3630](https://github.com/PyTorchLightning/pytorch-lightning/pull/3630))
+- Fixed `ModelCheckpoint` `period` to actually save every `period` epochs ([#3630](https://github.com/PyTorchLightning/pytorch-lightning/pull/3630))
+
+- Fixed `val_progress_bar` total with `num_sanity_val_steps` ([#3751](https://github.com/PyTorchLightning/pytorch-lightning/pull/3751))
+
+- Fixed `ModelCheckpoint` with `save_top_k=-1` option not tracking the best models when a monitor metric is available ([#3735](https://github.com/PyTorchLightning/pytorch-lightning/pull/3735))
+
+- Fixed counter-intuitive error being thrown in `Accuracy` metric for zero target tensor ([#3764](https://github.com/PyTorchLightning/pytorch-lightning/pull/3764))
 
-- Fixed `ModelCheckpoint` with `save_top_k=-1` option not tracking the best models when a monitor metric is available ([3735](https://github.com/PyTorchLightning/pytorch-lightning/pull/3735))
+- Fixed aggregation of metrics ([#3517](https://github.com/PyTorchLightning/pytorch-lightning/pull/3517))
 
 ## [0.9.0] - YYYY-MM-DD
 

diff --git a/README.md b/README.md
@@ -91,6 +91,7 @@ Get started with our [3 steps guide](https://pytorch-lightning.readthedocs.io/en
 - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_
 - _\** tests run on two NVIDIA K80_
 - _\*** tests run on Google GKE TPUv2/3_
+- _TPU w/ py3.6/py3.7 means we support Colab and Kaggle env._
 
 </center>
 

diff --git a/benchmarks/test_parity.py b/benchmarks/test_parity.py
@@ -11,7 +11,7 @@
 
 @pytest.mark.parametrize('cls_model,max_diff', [
     (ParityModuleRNN, 0.05),
-    (ParityModuleMNIST, 0.5)
+    (ParityModuleMNIST, 0.57)
 ])
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
 def test_pytorch_parity(tmpdir, cls_model, max_diff):