From 48e0b33d56f78d2fdcdb80f35cfad08e3da93e15 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 20:04:29 +0630 Subject: [PATCH 1/2] [Changelog] 1.0.4 (#4440) * changelog 1.0.4 * changelog 1.0.4 --- CHANGELOG.md | 51 +++++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf61cbfb6a094..6b170d3898172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,78 +9,65 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) +- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) +- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) -- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +### Changed +- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) -- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) +### Deprecated +- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) -- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +### Removed +### Fixed -- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) +- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) -- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) +## [1.0.4] - 2020-10-27 + +### Added +- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) -- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) +- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) -- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) +- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) -### Changed +- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +### Changed - Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) - - Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) - - Allow setting `replace_sampler_ddp=True` with a 
distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) - - Fixed santized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) - -- W&B log in sync with Trainer step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) - - ### Deprecated - - Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) - - Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) - - Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) - -- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) - -### Removed - - - ### Fixed - -- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) - - - Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) - - Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) - - Fixed WandbLogger not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) From 0f584faa6b3520efa378f78de4e12a34f039d8f9 Mon Sep 17 00:00:00 2001 From: Jeff Yang Date: Fri, 30 Oct 2020 22:12:14 +0630 Subject: [PATCH 2/2] PyTorch 1.7 Stable support (#3821) * prepare for 1.7 support [ci skip] * tpu [ci skip] * test run 1.7 * all 1.7, needs to fix tests * couple with torchvision * windows try * remove windows * 1.7 is here * on purpose fail [ci skip] * return [ci skip] * 1.7 docker * back to normal [ci skip] * change to some_val [ci skip] * add seed [ci skip] * 4 places [ci skip] * fail on purpose [ci skip] * verbose=True [ci skip] * use filename to track * use filename to track * monitor epoch + changelog * Update tests/checkpointing/test_model_checkpoint.py Co-authored-by: Rohit Gupta Co-authored-by: Sean Naren Co-authored-by: Rohit Gupta --- .github/workflows/ci_dockers.yml | 6 +++--- .github/workflows/ci_test-conda.yml | 2 +- .github/workflows/ci_test-full.yml | 2 +- .github/workflows/nightly.yml | 2 +- CHANGELOG.md | 2 ++ README.md | 10 +++++----- dockers/base-xla/Dockerfile | 2 +- pytorch_lightning/core/grads.py | 4 ++-- tests/checkpointing/test_model_checkpoint.py | 20 ++++++++++++++------ tests/metrics/utils.py | 2 +- tests/models/test_grad_norm.py | 4 ++-- 11 files changed, 33 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index c8816486f2688..8e8d56b04d501 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -40,7 +40,7 @@ jobs: fail-fast: false matrix: python_version: [3.7] - xla_version: [1.6] # todo: , "nightly" + xla_version: [1.6, "nightly"] steps: - name: Checkout uses: actions/checkout@v2 @@ -66,8 +66,8 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.8 - # pytorch_version: 1.7 # todo + - python_version: 3.8 + pytorch_version: 1.7 - python_version: 3.7 pytorch_version: 1.6 - python_version: 3.6 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index f652cbb1a4b58..b165b0a604344 100644 --- 
a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -16,7 +16,7 @@ jobs: matrix: # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo + pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml index d74a923693e0b..05db07c2a6c34 100644 --- a/.github/workflows/ci_test-full.yml +++ b/.github/workflows/ci_test-full.yml @@ -89,7 +89,7 @@ jobs: run: | # python -m pip install --upgrade --user pip pip install --requirement requirements.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade - pip install --requirement ./requirements/devel.txt --quiet --upgrade + pip install --requirement ./requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --quiet --upgrade python --version pip --version pip list diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index eb10c43936044..f60ecd09dfb84 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -76,7 +76,7 @@ jobs: fail-fast: false matrix: python_version: [3.6, 3.7, 3.8] - pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] exclude: # excludes PT 1.3 as it is missing on pypi - python_version: 3.8 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b170d3898172..c102be008d0f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added PyTorch 1.7 Stable support ([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) + - Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) - Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) diff --git a/README.md b/README.md index 21f4aaab19ad1..fea674bc396b0 100644 --- a/README.md +++ b/README.md @@ -89,14 +89,14 @@ Lightning can automatically export to ONNX or TorchScript for those cases. ## Continuous Integration
-| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | +| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 | 1.7 (latest) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | +| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | | Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | -| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| OSX py3.6 / py3.7 | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | -| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | +| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | +| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_ - _\** tests run on two NVIDIA K80_ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index f44465383a0e0..3eaabade428e6 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -110,4 +110,4 @@ RUN \ conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ - python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.7').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" + python -c "import torch; ver = '$XLA_VERSION' ; ver = dict(nightly='1.8').get(ver, ver) ; assert torch.__version__[:3] == ver, torch.__version__" diff --git a/pytorch_lightning/core/grads.py b/pytorch_lightning/core/grads.py index 2cdeaf4e59010..4ba1acf5689a7 100644 --- a/pytorch_lightning/core/grads.py +++ b/pytorch_lightning/core/grads.py @@ -46,11 +46,11 @@ def grad_norm(self, norm_type: Union[float, int, str]) -> Dict[str, float]: continue param_norm = 
float(p.grad.data.norm(norm_type)) - norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 3) + norms[f'grad_{norm_type}_norm_{name}'] = round(param_norm, 4) all_norms.append(param_norm) total_norm = float(torch.tensor(all_norms).norm(norm_type)) - norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 3) + norms[f'grad_{norm_type}_norm_total'] = round(total_norm, 4) return norms diff --git a/tests/checkpointing/test_model_checkpoint.py b/tests/checkpointing/test_model_checkpoint.py index 3bc2ca436ec15..19705a6ebc9a2 100644 --- a/tests/checkpointing/test_model_checkpoint.py +++ b/tests/checkpointing/test_model_checkpoint.py @@ -365,9 +365,14 @@ def test_model_checkpoint_topk_zero(tmpdir): def test_model_checkpoint_topk_all(tmpdir): """ Test that save_top_k=-1 tracks the best models when monitor key is provided. """ seed_everything(1000) - epochs = 2 - model = EvalModelTemplate() - checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_top_k=-1) + epochs = 3 + + class CustomModel(EvalModelTemplate): + def validation_epoch_end(self, outputs): + return {'epoch': self.current_epoch} + + model = CustomModel() + checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="epoch", mode='max', save_top_k=-1) trainer = Trainer( default_root_dir=tmpdir, checkpoint_callback=checkpoint_callback, @@ -375,10 +380,13 @@ def test_model_checkpoint_topk_all(tmpdir): logger=False, ) trainer.fit(model) - assert checkpoint_callback.best_model_path == tmpdir / "epoch=1.ckpt" - assert checkpoint_callback.best_model_score > 0 + + assert checkpoint_callback.monitor == 'epoch' + assert checkpoint_callback.best_model_path == tmpdir / "epoch=2.ckpt" + assert checkpoint_callback.best_model_score == epochs - 1 + assert len(os.listdir(tmpdir)) == len(checkpoint_callback.best_k_models) == epochs assert set(checkpoint_callback.best_k_models.keys()) == set(str(tmpdir / f"epoch={i}.ckpt") for i in range(epochs)) - assert checkpoint_callback.kth_best_model_path == tmpdir / "epoch=0.ckpt" + assert checkpoint_callback.kth_best_model_path == tmpdir / 'epoch=0.ckpt' def test_ckpt_metric_names(tmpdir): diff --git a/tests/metrics/utils.py b/tests/metrics/utils.py index b946d23c79813..e7cc7c6123b48 100644 --- a/tests/metrics/utils.py +++ b/tests/metrics/utils.py @@ -24,7 +24,7 @@ def setup_ddp(rank, world_size): os.environ["MASTER_ADDR"] = 'localhost' os.environ['MASTER_PORT'] = '8088' - if torch.distributed.is_available(): + if torch.distributed.is_available() and sys.platform not in ['win32', 'cygwin']: torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size) diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 89d8ff89999c1..51ba7f34048e0 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -49,11 +49,11 @@ def on_after_backward(self): norm = np.linalg.norm(flat, self.norm_type) norms.append(norm) - out[prefix + name] = round(norm, 3) + out[prefix + name] = round(norm, 4) # handle total norm norm = np.linalg.norm(norms, self.norm_type) - out[prefix + 'total'] = round(norm, 3) + out[prefix + 'total'] = round(norm, 4) self.stored_grad_norms.append(out)
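
For readers following the changelog entries above, here is a minimal usage sketch of the `ModelCheckpoint` changes they describe: the new `dirpath`/`filename` parameters that replace the deprecated `filepath` ([#4213]), the monitored key now being written into saved checkpoints ([#4383]), and registering the callback through `callbacks=` rather than the deprecated `checkpoint_callback=ModelCheckpoint(...)` form ([#4336]). The `TinyModel`, the random dataset, and the `checkpoints/` path are hypothetical stand-ins, not part of this patch; the snippet assumes `pytorch-lightning>=1.0.4`.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint


class TinyModel(pl.LightningModule):
    """Hypothetical stand-in model, only here to exercise the checkpoint callback."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(8, 1)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self(x), y)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        # The logged name is what ModelCheckpoint monitors below.
        self.log("val_loss", torch.nn.functional.mse_loss(self(x), y))

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


def loader():
    x, y = torch.randn(64, 8), torch.randn(64, 1)
    return DataLoader(TensorDataset(x, y), batch_size=16)


# New in 1.0.4: `dirpath` + `filename` instead of the deprecated `filepath` argument.
checkpoint_cb = ModelCheckpoint(
    dirpath="checkpoints/",
    filename="{epoch}-{val_loss:.2f}",
    monitor="val_loss",  # the monitored key is also stored in the saved checkpoint (#4383)
    mode="min",
    save_top_k=3,
)

# Register via `callbacks=`; passing the instance to `checkpoint_callback=` is deprecated (#4336).
trainer = pl.Trainer(max_epochs=3, callbacks=[checkpoint_cb], logger=False)
trainer.fit(TinyModel(), loader(), loader())
print(checkpoint_cb.best_model_path)
```

The second patch also changes the rounding of logged gradient norms from 3 to 4 decimal places. The helper below is a standalone reconstruction of that logic as it appears in the patched `pytorch_lightning/core/grads.py` and `tests/models/test_grad_norm.py`, usable outside Lightning for sanity checks; it is a sketch, not an import of the library function.

```python
import torch


def grad_norms(model: torch.nn.Module, norm_type: float = 2.0) -> dict:
    """Per-parameter and total gradient norms, rounded to 4 decimals as in the patch."""
    norms, all_norms = {}, []
    for name, p in model.named_parameters():
        if p.grad is None:
            continue
        param_norm = float(p.grad.data.norm(norm_type))
        norms[f"grad_{norm_type}_norm_{name}"] = round(param_norm, 4)  # previously round(..., 3)
        all_norms.append(param_norm)
    total_norm = float(torch.tensor(all_norms).norm(norm_type))
    norms[f"grad_{norm_type}_norm_total"] = round(total_norm, 4)
    return norms
```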