From 48e0b33d56f78d2fdcdb80f35cfad08e3da93e15 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 20:04:29 +0630 Subject: [PATCH 1/2] [Changelog] 1.0.4 (#4440) * changelog 1.0.4 * changelog 1.0.4 --- CHANGELOG.md | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf61cbfb6a094..6b170d3898172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,78 +9,65 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) +- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) +- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) -- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +### Changed +- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) -- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) +### Deprecated +- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) -- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +### Removed +### Fixed -- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) +- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) -- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) +## [1.0.4] - 2020-10-27 + +### Added +- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) -- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) -- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) +- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) -### Changed +- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +### Changed - Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) - - Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) - - Allow setting `replace_sampler_ddp=True` with a 
distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) - - Fixed santized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) - -- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) - - ### Deprecated - - Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) - - Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) - - Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) - -- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) - -### Removed - - - ### Fixed - -- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) - - - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) - - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) - - Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) From 0f584faa6b3520efa378f78de4e12a34f039d8f9 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 22:12:14 +0630 Subject: [PATCH 2/2] PyTorch 1.7 Stable support (#3821) * prepare for 1.7 support [ci skip] * tpu [ci skip] * test run 1.7 * all 1.7, needs to fix tests * couple with torchvision * windows try * remove windows * 1.7 is here * on purpose fail [ci skip] * return [ci skip] * 1.7 docker * back to normal [ci skip] * change to some_val [ci skip] * add seed [ci skip] * 4 places [ci skip] * fail on purpose [ci skip] * verbose=True [ci skip] * use filename to track * use filename to track * monitor epoch + changelog * Update tests/checkpointing/test_model_checkpoint.py Co-authored-by: Rohit Gupta Co-authored-by: Sean Naren Co-authored-by: Rohit Gupta --- .github/workflows/ci_dockers.yml | 6 +++--- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 2 +- .github/workflows/nightly.yml | 2 +- CHANGELOG.md | 2 ++ README.md | 10 +++++----- dockers/base-xla/Dockerfile | 2 +- pytorch_lightning/core/grads.py | 4 ++-- tests/checkpointing/test_model_checkpoint.py | 20 ++++++++++++++------ tests/metrics/utils.py | 2 +- tests/models/test_grad_norm.py | 4 ++-- 11 files changed, 33 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index c8816486f2688..8e8d56b04d501 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: python_version: [3.7] - xla_version: [1.6] # todo: , "nightly" + xla_version: [1.6, "nightly"] steps: - name: Checkout uses: actions/checkout@v2 @@ -66,8 +66,8 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.8 - # pytorch_version: 1.7 # todo + - python_version: 3.8 + pytorch_version: 1.7 - python_version: 3.7 pytorch_version: 1.6 - python_version: 3.6 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index f652cbb1a4b58..b165b0a604344 100644 --- 
a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -16,7 +16,7 @@ jobs: matrix: # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo + pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index d74a923693e0b..05db07c2a6c34 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -89,7 +89,7 @@ jobs: run: | # python -m pip install --upgrade --user pip pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade - pip install --requirement ./requirements/devel.txt --quiet --upgrade + pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade python --version pip --version pip list diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index eb10c43936044..f60ecd09dfb84 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -76,7 +76,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b170d3898172..c102be008d0f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) + - Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) - Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) diff --git a/README.md b/README.md index 21f4aaab19ad1..fea674bc396b0 100644 --- a/README.md +++ b/README.md @@ -89,14 +89,14 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## Continuous Integration
-| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | +| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | | Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | -| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| OSX py3.6 / py3.7 | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | +| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_ - _\** tests run on two NVIDIA K80_ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index f44465383a0e0..3eaabade428e6 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -110,4 +110,4 @@ RUN \ conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.7').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" + python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.8').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" diff --git a/pytorch_lightning/core/grads.py b/pytorch_lightning/core/grads.py index 2cdeaf4e59010..4ba1acf5689a7 100644 --- a/pytorch_lightning/core/grads.py +++ b/pytorch_lightning/core/grads.py @@ -46,11 +46,11 @@ def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]: continue param_norm = 
float(p.grad.data.norm(norm_type)) - norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 3) + norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 4) all_norms.append(param_norm) total_norm = float(torch.tensor(all_norms).norm(norm_type)) - norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 3) + norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 4) return norms diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3bc2ca436ec15..19705a6ebc9a2 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -365,9 +365,14 @@ def test_model_checkpoint_topk_zero(tmpdir): def test_model_checkpoint_topk_all(tmpdir): """ Test that save_top_k=-1 tracks the best models when monitor key is provided. """ seed_everything(1000) - epochs = 2 - model = EvalModelTemplate() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=-1) + epochs = 3 + + class CustomModel(EvalModelTemplate): + def validation_epoch_end(self, outputs): + return {'epoch': self.current_epoch} + + model = CustomModel() + checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="epoch", mode='max', save_top_k=-1) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, @@ -375,10 +380,13 @@ def test_model_checkpoint_topk_all(tmpdir): logger=False, ) trainer.fit(model) - assert checkpoint_callback.best_model_path == tmpdir / "epoch=1.ckpt" - assert checkpoint_callback.best_model_score > 0 + + assert checkpoint_callback.monitor == 'epoch' + assert checkpoint_callback.best_model_path == tmpdir / "epoch=2.ckpt" + assert checkpoint_callback.best_model_score == epochs - 1 + assert len(os.listdir(tmpdir)) == len(checkpoint_callback.best_k_models) == epochs assert set(checkpoint_callback.best_k_models.keys()) == set(str(tmpdir / f"epoch={i}.ckpt") for i in range(epochs)) - assert checkpoint_callback.kth_best_model_path == tmpdir / "epoch=0.ckpt" + assert checkpoint_callback.kth_best_model_path == tmpdir / 'epoch=0.ckpt' def test_ckpt_metric_names(tmpdir): diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index b946d23c79813..e7cc7c6123b48 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -24,7 +24,7 @@ def setup_ddp(rank, world_size): os.environ["MASTER_ADDR"] = 'localhost' os.environ['MASTER_PORT'] = '8088' - if torch.distributed.is_available(): + if torch.distributed.is_available() and sys.platform not in ['win32', 'cygwin']: torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 89d8ff89999c1..51ba7f34048e0 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -49,11 +49,11 @@ def on_after_backward(self): norm = np.linalg.norm(flat, self.norm_type) norms.append(norm) - out[prefix + name] = round(norm, 3) + out[prefix + name] = round(norm, 4) # handle total norm norm = np.linalg.norm(norms, self.norm_type) - out[prefix + 'total'] = round(norm, 3) + out[prefix + 'total'] = round(norm, 4) self.stored_grad_norms.append(out)
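
For readers following the changelog entries above, here is a minimal usage sketch of the `ModelCheckpoint` changes they describe: the new `dirpath`/`filename` parameters that replace the deprecated `filepath` ([#4213]), the monitored key now being written into saved checkpoints ([#4383]), and registering the callback through `callbacks=` rather than the deprecated `checkpoint_callback=ModelCheckpoint(...)` form ([#4336]). The `TinyModel`, the random dataset, and the `checkpoints/` path are hypothetical stand-ins, not part of this patch; the snippet assumes `pytorch-lightning>=1.0.4`.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class TinyModel(pl.LightningModule):
    """Hypothetical stand-in model, only here to exercise the checkpoint callback."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        # The logged name is what ModelCheckpoint monitors below.
        self.log("val_loss", torch.nn.functional.mse_loss(self(x), y))

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


def loader():
    x, y = torch.randn(64, 8), torch.randn(64, 1)
    return DataLoader(TensorDataset(x, y), batch_size=16)


# New in 1.0.4: `dirpath` + `filename` instead of the deprecated `filepath` argument.
checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints/",
    filename="{epoch}-{val_loss:.2f}",
    monitor="val_loss",  # the monitored key is also stored in the saved checkpoint (#4383)
    mode="min",
    save_top_k=3,
)

# Register via `callbacks=`; passing the instance to `checkpoint_callback=` is deprecated (#4336).
trainer = pl.Trainer(max_epochs=3, callbacks=[checkpoint_cb], logger=False)
trainer.fit(TinyModel(), loader(), loader())
print(checkpoint_cb.best_model_path)
```

The second patch also changes the rounding of logged gradient norms from 3 to 4 decimal places. The helper below is a standalone reconstruction of that logic as it appears in the patched `pytorch_lightning/core/grads.py` and `tests/models/test_grad_norm.py`, usable outside Lightning for sanity checks; it is a sketch, not an import of the library function.

```python
import torch


def grad_norms(model: torch.nn.Module, norm_type: float = 2.0) -> dict:
    """Per-parameter and total gradient norms, rounded to 4 decimals as in the patch."""
    norms, all_norms = {}, []
    for name, p in model.named_parameters():
        if p.grad is None:
            continue
        param_norm = float(p.grad.data.norm(norm_type))
        norms[f"grad_{norm_type}_norm_{name}"] = round(param_norm, 4)  # previously round(..., 3)
        all_norms.append(param_norm)
    total_norm = float(torch.tensor(all_norms).norm(norm_type))
    norms[f"grad_{norm_type}_norm_total"] = round(total_norm, 4)
    return norms
```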