From bacc2081188537f98981d0855e1ed18083454447 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 15:20:56 +0200 Subject: [PATCH 01/55] init --- .../test_trainer_test_loop.py} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename tests/{models/test_test_loop.py => trainer/test_trainer_test_loop.py} (95%) diff --git a/tests/models/test_test_loop.py b/tests/trainer/test_trainer_test_loop.py similarity index 95% rename from tests/models/test_test_loop.py rename to tests/trainer/test_trainer_test_loop.py index 10d8d35800e81..e8151c2c5bd1d 100644 --- a/tests/models/test_test_loop.py +++ b/tests/trainer/test_trainer_test_loop.py @@ -1,9 +1,9 @@ -import os +import pytest +import torch + import pytorch_lightning as pl -from tests.base import EvalModelTemplate import tests.base.develop_utils as tutils -import torch -import pytest +from tests.base import EvalModelTemplate @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -12,7 +12,7 @@ def test_single_gpu_test(tmpdir): model = EvalModelTemplate() trainer = pl.Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, @@ -43,7 +43,7 @@ def test_dp_test(tmpdir): model = EvalModelTemplate() trainer = pl.Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, @@ -72,7 +72,7 @@ def test_ddp_spawn_test(tmpdir): model = EvalModelTemplate() trainer = pl.Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, From b93fab4d930dd68111c1969b243b9407c25472e6 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:12:34 +0200 Subject: [PATCH 02/55] rename --- .github/workflows/tpu-testing.yml | 11 ++++---- tests/models/test_tpu.py | 47 ++++++++++++++++--------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index d4ab768a885dd..f5a08580ea737 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -14,6 +14,8 @@ env: GKE_CLUSTER: lightning-cluster GKE_ZONE: us-central1-a IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image + MAX_CHECKS: 60 + CHECK_SPEEP: 30 jobs: setup-build-publish-deploy: @@ -82,17 +84,16 @@ jobs: job_name=${job_name% created} && \ echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \ i=0 && \ - # 30 checks spaced 30s apart = 900s total. - max_checks=30 && \ + # 60 checks spaced 30s apart = 900s total. status_code=2 && \ # Check on the job periodically. Set the status code depending on what - # happened to the job in Kubernetes. If we try max_checks times and + # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. 
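Illustration only, not part of the patch: the shell one-liner that follows is dense, and the same bounded-polling pattern reads more easily in Python. Function and argument names here are stand-ins.

import time

def wait_for_job(get_status, max_checks=60, check_sleep=30):
    """Poll get_status() until it reports success or failure, or give up.

    Mirrors the loop below: returns 0 on success, 1 on failure, and the
    starting code 2 if the job is still unfinished after max_checks polls
    spaced check_sleep seconds apart.
    """
    status_code = 2
    for _ in range(max_checks):
        status = get_status()  # e.g. parsed from `kubectl get jobs <name> -o jsonpath=...`
        if status == 'failed':
            status_code = 1
            break
        if status == 'succeeded':
            status_code = 0
            break
        print('Job not finished yet')
        time.sleep(check_sleep)
    return status_code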
- while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. - sleep 60 && \ + sleep $CHECK_SPEEP && \ echo "JOB_NAME: $job_name" && \ echo "GKE_CLUSTER: $GKE_CLUSTER" && \ echo "GKE_ZONE: $GKE_ZONE" && \ diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 754e2652225c0..116db49d7ed77 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -23,7 +23,7 @@ @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_model_1(tmpdir): +def test_model_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -39,7 +39,7 @@ def test_base_tpu_model_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_model_idx_1(tmpdir): +def test_model_tpu_idx_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -55,7 +55,23 @@ def test_base_tpu_model_idx_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_model_8(tmpdir): +def test_model_tpu_idx_1_8(tmpdir): + """Make sure model trains on TPU.""" + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + tpu_cores=[1, 8], + limit_train_batches=0.4, + limit_val_batches=0.4 + ) + + model = EvalModelTemplate() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + + +@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +def test_model_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -76,10 +92,11 @@ def long_train_loader(): model.val_dataloader = long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_16bit_model_core_1(tmpdir): +def test_model_16bit_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -97,7 +114,7 @@ def test_base_tpu_16bit_model_core_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_16bit_model_idx_core(tmpdir): +def test_model_16bit_tpu_idx_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -115,7 +132,7 @@ def test_base_tpu_16bit_model_idx_core(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_16bit_model_8_cores(tmpdir): +def test_model_16bit_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -158,23 +175,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): 
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_single_tpu_core_model(tmpdir): - """Test if single TPU core training works""" - model = EvalModelTemplate() - trainer = Trainer( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - train_percent_check=0.1, - val_percent_check=0.1, - tpu_cores=8, - ) - trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' - - -@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_multi_core_tpu_model(tmpdir): +def test_model_16bit_tpu_index_1_8(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( From 15cb65cffaf1311a0d1e8d97da464ecb01eddafb Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:42:33 +0200 Subject: [PATCH 03/55] tpu_core_idx --- pytorch_lightning/trainer/distrib_parts.py | 1 - tests/models/test_tpu.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 3c0461e713832..f5dd04c1bd332 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -79,7 +79,6 @@ class TrainerDPMixin(ABC): use_tpu: bool data_parallel_device_ids: ... progress_bar_callback: ... - tpu_id: Optional[int] on_colab_kaggle: str save_spawn_weights: Callable logger: ... diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 116db49d7ed77..687de6649c828 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -52,22 +52,24 @@ def test_model_tpu_idx_1(tmpdir): model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_tpu_idx_1_8(tmpdir): +def test_model_tpu_idx_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, - tpu_cores=[1, 8], + tpu_cores=[8, 1], limit_train_batches=0.4, limit_val_batches=0.4 ) model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -92,7 +94,6 @@ def long_train_loader(): model.val_dataloader = long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -128,6 +129,7 @@ def test_model_16bit_tpu_idx_1(tmpdir): model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" From 5ac3c6118eda461b2e3a8cf559ecb4d54390fb73 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:43:51 +0200 Subject: [PATCH 04/55] idx 8 --- tests/models/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 687de6649c828..4c4b11ad628f0 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -62,7 +62,7 @@ def test_model_tpu_idx_8(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, 
max_epochs=1, - tpu_cores=[8, 1], + tpu_cores=[8], limit_train_batches=0.4, limit_val_batches=0.4 ) From 724b7b4c32f86d00736260831857c1ff2d1bdc94 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:55:26 +0200 Subject: [PATCH 05/55] idxs --- tests/models/test_tpu.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 4c4b11ad628f0..461b143ee08bb 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -39,7 +39,7 @@ def test_model_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_tpu_idx_1(tmpdir): +def test_model_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -56,7 +56,7 @@ def test_model_tpu_idx_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_tpu_idx_8(tmpdir): +def test_model_tpu_index_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -115,7 +115,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_16bit_tpu_idx_1(tmpdir): +def test_model_16bit_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -177,7 +177,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_16bit_tpu_index_1_8(tmpdir): +def test_model_16bit_tpu_index_8(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( @@ -185,7 +185,7 @@ def test_model_16bit_tpu_index_1_8(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - tpu_cores=[1, 8], + tpu_cores=[8], ) trainer.fit(model) assert trainer.tpu_id is None @@ -221,7 +221,14 @@ def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id): assert Trainer(tpu_cores=tpu_cores).tpu_id == expected_tpu_id +def test_tpu_misconfiguration(): + """Test if trainer.tpu_id is set as expected""" + with pytest.raises(MisconfigurationException, match="`tpu_cores` can only be"): + Trainer(tpu_cores=[1, 8]) + + @patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) +@pytest.mark.skipif(TPU_AVAILABLE, reason="test requires missing TPU") def test_exception_when_no_tpu_found(tmpdir): """Test if exception is thrown when xla devices are not available""" model = EvalModelTemplate() From d817f0ec3390afdf3ae5572aaf58aba8d923c19f Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 21:44:44 +0200 Subject: [PATCH 06/55] @pl_multi_process_test --- tests/models/test_tpu.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 461b143ee08bb..09806db43313c 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -2,13 +2,14 @@ from unittest.mock import patch import pytest +from torch.utils.data import DataLoader +import tests.base.develop_pipelines as tpipes from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -import tests.base.develop_pipelines as tpipes from tests.base.datasets import TrialMNIST -from torch.utils.data import DataLoader +from tests.base.develop_utils import pl_multi_process_test try: import torch_xla @@ -23,6 +24,7 @@ 
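For context on the @pl_multi_process_test decorator being added to the tests below: it is imported from tests/base/develop_utils.py, and its body appears in later patches of this series. A condensed, standalone sketch of the same run-the-test-in-a-child-process pattern follows; it is a simplified stand-in, not the actual decorator.

import functools
import multiprocessing


def run_in_child_process(func):
    """Run the wrapped test in a separate process and assert it succeeded."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        queue = multiprocessing.SimpleQueue()

        def inner(q, *a, **kw):
            try:
                func(*a, **kw)
                q.put(1)        # success marker read back by the parent
            except Exception:
                q.put(-1)       # any failure in the child maps to -1

        # relies on the default (fork) start method so the closure is usable;
        # with 'spawn' the target would need to be an importable function
        proc = multiprocessing.Process(target=inner, args=(queue, *args), kwargs=kwargs)
        proc.start()
        proc.join()
        result = queue.get()
        assert result == 1, 'expected 1, but returned %s' % result
    return wrapper


@run_in_child_process
def test_addition():
    assert 1 + 1 == 2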
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -39,6 +41,7 @@ def test_model_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -56,6 +59,7 @@ def test_model_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_index_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -73,6 +77,7 @@ def test_model_tpu_index_8(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -97,6 +102,7 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -115,6 +121,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -134,6 +141,7 @@ def test_model_16bit_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -160,6 +168,7 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_early_stop_checkpoints_on_tpu(tmpdir): """Test if single TPU core training works""" model = EvalModelTemplate() @@ -170,13 +179,14 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, - tpu_cores=[8], + tpu_cores=[1], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_index_8(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() @@ -192,6 +202,7 @@ def test_model_16bit_tpu_index_8(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_dataloaders_passed_to_fit(tmpdir): """Test if dataloaders passed to trainer works on TPU""" From be79c50a245741fb75a4097a076be9e6a270ecf7 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 00:01:37 +0200 Subject: [PATCH 07/55] assert --- tests/models/test_tpu.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 09806db43313c..10074854eea54 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -167,6 +167,23 @@ def long_train_loader(): assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" +@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test +def test_model_16bit_tpu_index_8(tmpdir): + """Test if distributed TPU core training works""" 
+ model = EvalModelTemplate() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + train_percent_check=0.4, + val_percent_check=0.2, + tpu_cores=[8], + ) + trainer.fit(model) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" + + @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_early_stop_checkpoints_on_tpu(tmpdir): @@ -187,18 +204,20 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_8(tmpdir): - """Test if distributed TPU core training works""" +def test_early_stop_checkpoints_on_tpu(tmpdir): + """Test if single TPU core training works""" model = EvalModelTemplate() trainer = Trainer( + early_stop_callback=True, default_root_dir=tmpdir, - max_epochs=1, - train_percent_check=0.4, - val_percent_check=0.2, + progress_bar_refresh_rate=0, + max_epochs=50, + limit_train_batches=10, + limit_val_batches=10, tpu_cores=[8], ) trainer.fit(model) - assert trainer.tpu_id is None + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -238,7 +257,7 @@ def test_tpu_misconfiguration(): Trainer(tpu_cores=[1, 8]) -@patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) +# @patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) @pytest.mark.skipif(TPU_AVAILABLE, reason="test requires missing TPU") def test_exception_when_no_tpu_found(tmpdir): """Test if exception is thrown when xla devices are not available""" From f35437f0d91bc629cfe72bf1dc5bd5504e5c44cf Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 00:27:14 +0200 Subject: [PATCH 08/55] assert --- tests/base/develop_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 068c8294ee5cb..6dc4cb2fe893c 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -100,7 +100,7 @@ def inner_f(queue, **kwargs): try: func(**kwargs) queue.put(1) - except Exception as e: + except Exception: import traceback traceback.print_exc() queue.put(-1) @@ -109,6 +109,6 @@ def inner_f(queue, **kwargs): p.start() p.join() result = queue.get() - assert result == 1 + assert result == 1, 'expected 1, but returned %s' % result return wrapper From 773e52a1aeafa7ebd17ea709dfbae548f685e8bd Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 01:06:01 +0200 Subject: [PATCH 09/55] deamon --- tests/base/develop_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 6dc4cb2fe893c..978235935f899 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -89,6 +89,7 @@ def init_checkpoint_callback(logger): def pl_multi_process_test(func): + """Wrapper for running multi-processing tests.""" @functools.wraps(func) def wrapper(*args, **kwargs): @@ -105,10 +106,13 @@ def inner_f(queue, **kwargs): traceback.print_exc() queue.put(-1) - p = Process(target=inner_f, args=(queue,), kwargs=kwargs) - p.start() - p.join() + proc = Process(target=inner_f, args=(queue,), kwargs=kwargs) + proc.start() + proc.join() result = queue.get() assert result == 1, 'expected 1, but returned %s' % result + proc.close() + queue.close() + return wrapper From 
30cda419a8fac552274542c276f54a568571756b Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 01:51:50 +0200 Subject: [PATCH 10/55] no close --- tests/base/develop_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 978235935f899..0b23c61c48e10 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -109,10 +109,8 @@ def inner_f(queue, **kwargs): proc = Process(target=inner_f, args=(queue,), kwargs=kwargs) proc.start() proc.join() + result = queue.get() assert result == 1, 'expected 1, but returned %s' % result - proc.close() - queue.close() - return wrapper From be98712d917b8ebb9e5c5602b97922adfb552692 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 19 Jul 2020 00:34:31 +0200 Subject: [PATCH 11/55] imort --- tests/models/test_tpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 10074854eea54..dbb0a6a30f6b3 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -1,5 +1,4 @@ import os -from unittest.mock import patch import pytest from torch.utils.data import DataLoader From f93c5aadd6890e3c996a49cdef0dedad37e99a09 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 19 Jul 2020 18:39:24 +0200 Subject: [PATCH 12/55] msg --- tests/models/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index dbb0a6a30f6b3..2e4ef0e4afcae 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -269,5 +269,5 @@ def test_exception_when_no_tpu_found(tmpdir): tpu_cores=8, ) - with pytest.raises(MisconfigurationException, match='No TPU devices found.'): + with pytest.raises(MisconfigurationException, match='PyTorch XLA not installed.'): trainer.fit(model) From 1dab0c85246685fef27955862a7cd857c66567c9 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 21 Jul 2020 09:04:24 +0200 Subject: [PATCH 13/55] use_single_gpu --- docs/source/index.rst | 2 +- pytorch_lightning/trainer/distrib_data_parallel.py | 10 ++++------ pytorch_lightning/trainer/distrib_parts.py | 4 ++-- pytorch_lightning/trainer/evaluation_loop.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 12 ++++++------ pytorch_lightning/trainer/training_loop.py | 4 ++-- tests/trainer/test_trainer.py | 2 +- 7 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 4b1b7c697a6c8..9ab0da242d330 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,7 +93,7 @@ PyTorch Lightning Documentation weights_loading optimizers profiler - single_gpu + use_single_gpu sequences training_tricks transfer_learning diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 63c6c9b8513ab..21019bff12cd5 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -171,8 +171,6 @@ def train_fx(trial_hparams, cluster_manager, _): try: import torch_xla - import torch_xla.core.xla_model as xm - import torch_xla.distributed.xla_multiprocessing as xmp except ImportError: XLA_AVAILABLE = False else: @@ -263,7 +261,7 @@ def set_distributed_mode(self, distributed_backend): self.use_ddp = False self.use_ddp2 = False self.use_horovod = False - self.single_gpu = False + self.use_single_gpu = False if distributed_backend is None: if self.has_horovodrun(): @@ -272,7 +270,7 @@ def set_distributed_mode(self, 
distributed_backend): if self.num_nodes > 1 or self.num_processes > 1: self.use_ddp = True # ddp_cpu elif self.num_gpus == 1: - self.single_gpu = True + self.use_single_gpu = True elif self.num_gpus > 1: rank_zero_warn('You requested multiple GPUs but did not specify a backend, e.g.' ' Trainer(distributed_backend=dp) (or ddp, ddp2).' @@ -283,7 +281,7 @@ def set_distributed_mode(self, distributed_backend): if distributed_backend == "dp": # do nothing if num_gpus == 0 if self.num_gpus == 1: - self.single_gpu = True + self.use_single_gpu = True self.use_dp = True elif self.num_gpus > 1: self.use_dp = True @@ -293,7 +291,7 @@ def set_distributed_mode(self, distributed_backend): if self.num_nodes > 1 or self.num_processes > 1: self.use_ddp = True # ddp_cpu elif self.num_gpus == 1: - self.single_gpu = True + self.use_single_gpu = True self.use_ddp = True elif self.num_gpus > 1: self.use_ddp = True diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index f5dd04c1bd332..63db623d91f0b 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -69,7 +69,7 @@ class TrainerDPMixin(ABC): use_ddp2: bool use_ddp: bool testing: bool - single_gpu: bool + use_single_gpu: bool root_gpu: ... amp_level: str precision: ... @@ -128,7 +128,7 @@ def copy_trainer_model_properties(self, model): m.use_ddp = self.use_ddp m.use_amp = self.use_amp m.testing = self.testing - m.single_gpu = self.single_gpu + m.use_single_gpu = self.use_single_gpu m.use_tpu = self.use_tpu m.tpu_local_core_rank = self.tpu_local_core_rank m.tpu_global_core_rank = self.tpu_global_core_rank diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index f52f9c12f3c83..add9bb24c672a 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -163,7 +163,7 @@ class TrainerEvaluationLoopMixin(ABC): use_dp: bool use_ddp2: bool use_horovod: bool - single_gpu: bool + use_single_gpu: bool data_parallel_device_ids: ... 
model: LightningModule num_test_batches: List[int] @@ -616,7 +616,7 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: args[0] = batch # single GPU data transfer - if self.single_gpu: + if self.use_single_gpu: # for single GPU put inputs on gpu manually root_gpu = 0 if isinstance(self.data_parallel_device_ids, list): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b774ce13e1ef0..ca81066fb4018 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,7 +51,7 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.trainer.configuration_validator import ConfigValidator -from pytorch_lightning import accelerator_backends +from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend # warnings to ignore in trainer warnings.filterwarnings( @@ -1094,7 +1094,7 @@ def fit( results = self.spawn_ddp_children(model) elif self.use_dp: - self.accelerator_backend = accelerator_backends.DataParallelBackend(self) + self.accelerator_backend = DataParallelBackend(self) self.accelerator_backend.setup(model) results = self.accelerator_backend.train() self.accelerator_backend.teardown() @@ -1102,19 +1102,19 @@ def fit( elif self.use_horovod: results = self.horovod_train(model) - elif self.single_gpu: - self.accelerator_backend = accelerator_backends.GPUBackend(self) + elif self.use_single_gpu: + self.accelerator_backend = GPUBackend(self) model = self.accelerator_backend.setup(model) results = self.accelerator_backend.train(model) elif self.use_tpu: - self.accelerator_backend = accelerator_backends.TPUBackend(self) + self.accelerator_backend = TPUBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model) self.accelerator_backend.teardown() else: - self.accelerator_backend = accelerator_backends.CPUBackend(self) + self.accelerator_backend = CPUBackend(self) self.accelerator_backend.setup(model) results = self.accelerator_backend.train(model) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7f12d20151c24..a0f0144679df9 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -217,7 +217,7 @@ class TrainerTrainLoopMixin(ABC): use_dp: bool use_ddp2: bool use_horovod: bool - single_gpu: bool + use_single_gpu: bool use_tpu: bool data_parallel_device_ids: ... check_val_every_n_epoch: ... 
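Aside, as a simplified illustration rather than the real classes: the backends imported explicitly in trainer.py above (GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, DataParallelBackend) are all driven by fit() through the same small protocol: construct with the trainer, setup the model, train, and optionally teardown. Roughly:

class MinimalBackend:
    """Stand-in showing the shape of an accelerator backend, not the real API."""

    def __init__(self, trainer):
        self.trainer = trainer

    def setup(self, model):
        # real backends move or wrap the model for their device here
        return model

    def train(self, model):
        # real backends end up in the trainer's pretrain/training routine
        return {'status': 'finished'}

    def teardown(self, model):
        # optional: restore weights in the main process, clean up resources
        return model


backend = MinimalBackend(trainer=None)
model = backend.setup(model=object())
results = backend.train(model)
assert results['status'] == 'finished'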
@@ -1068,7 +1068,7 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): output = self.model.training_step(*args) # single GPU forward - elif self.single_gpu: + elif self.use_single_gpu: gpu_id = 0 if isinstance(self.data_parallel_device_ids, list): gpu_id = self.data_parallel_device_ids[0] diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index bfca63be60aa0..9eb8fc0f14051 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -897,7 +897,7 @@ def test_trainer_config(trainer_kwargs, expected): assert trainer.use_ddp2 is expected["use_ddp2"] assert trainer.num_gpus == expected["num_gpus"] assert trainer.on_gpu is expected["on_gpu"] - assert trainer.single_gpu is expected["single_gpu"] + assert trainer.use_single_gpu is expected["single_gpu"] assert trainer.num_processes == expected["num_processes"] From 910f1c4a07067ebcf6b44cd80c3f6646166970a4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 22 Jul 2020 00:16:33 +0200 Subject: [PATCH 14/55] dataset --- tests/base/datasets.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 5eb32656c83a3..b1991de6aa9e6 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -1,6 +1,7 @@ import logging import os import urllib.request +import time from typing import Tuple, Optional, Sequence import torch @@ -61,7 +62,14 @@ def __init__(self, root: str = PATH_DATASETS, train: bool = True, raise RuntimeError('Dataset not found.') data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME - self.data, self.targets = torch.load(os.path.join(self.cached_folder_path, data_file)) + # FIXME: try to fix loading + for _ in range(30): + try: + self.data, self.targets = torch.load(os.path.join(self.cached_folder_path, data_file)) + except Exception: + time.sleep(1) + else: + break def __getitem__(self, idx: int) -> Tuple[Tensor, int]: img = self.data[idx].float().unsqueeze(0) From 410742a386fb54365d12197d6057c12abb2a11a0 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 22 Jul 2020 00:35:57 +0200 Subject: [PATCH 15/55] idx --- tests/models/test_tpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 2e4ef0e4afcae..b3aebda22cebe 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -59,13 +59,13 @@ def test_model_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_tpu_index_8(tmpdir): +def test_model_tpu_index_7(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, - tpu_cores=[8], + tpu_cores=[7], limit_train_batches=0.4, limit_val_batches=0.4 ) @@ -168,7 +168,7 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_8(tmpdir): +def test_model_16bit_tpu_index_7(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( @@ -176,7 +176,7 @@ def test_model_16bit_tpu_index_8(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - tpu_cores=[8], + tpu_cores=[7], ) trainer.fit(model) assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' From 2ad7b42ae023d043db6d1e9931036a9fb75a6777 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 22 Jul 2020 23:30:31 +0200 Subject: [PATCH 16/55] 
fix idx --- tests/models/test_tpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index b3aebda22cebe..d0e752c9018ba 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -179,7 +179,7 @@ def test_model_16bit_tpu_index_7(tmpdir): tpu_cores=[7], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -216,7 +216,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): tpu_cores=[8], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 7576549c19db673dee41a825ff49511e2e4007e7 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 23 Jul 2020 00:12:21 +0200 Subject: [PATCH 17/55] dataset --- tests/base/datasets.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index b1991de6aa9e6..e55ef3484f7ea 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -63,13 +63,7 @@ def __init__(self, root: str = PATH_DATASETS, train: bool = True, data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME # FIXME: try to fix loading - for _ in range(30): - try: - self.data, self.targets = torch.load(os.path.join(self.cached_folder_path, data_file)) - except Exception: - time.sleep(1) - else: - break + self.data, self.targets = _try_load(os.path.join(self.cached_folder_path, data_file)) def __getitem__(self, idx: int) -> Tuple[Tensor, int]: img = self.data[idx].float().unsqueeze(0) @@ -111,6 +105,19 @@ def _download(self, data_folder: str) -> None: urllib.request.urlretrieve(url, fpath) +def _try_load(path_data, trials=30): + res = None + assert os.path.isfile(path_data) + for _ in range(trials): + try: + res = torch.load(path_data) + except Exception: + time.sleep(1) + else: + break + return res + + def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor: tensor = tensor.clone() mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) @@ -195,7 +202,7 @@ def prepare_data(self, download: bool) -> None: for fname in (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME): path_fname = os.path.join(super().cached_folder_path, fname) assert os.path.isfile(path_fname), 'Missing cached file: %s' % path_fname - data, targets = torch.load(path_fname) + data, targets = _try_load(path_fname) data, targets = self._prepare_subset(data, targets, self.num_samples, self.digits) torch.save((data, targets), os.path.join(self.cached_folder_path, fname)) From accea7fdff536fb9b50c3c5fe5a93873a3425fcf Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 23 Jul 2020 12:08:00 +0200 Subject: [PATCH 18/55] format --- pytorch_lightning/trainer/distrib_data_parallel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 21019bff12cd5..d2cdb532510e1 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -176,9 +176,9 @@ def train_fx(trial_hparams, cluster_manager, _): else: XLA_AVAILABLE = True -pid = os.getpid() -rng1 = np.random.RandomState(pid) -RANDOM_PORTS 
= rng1.randint(10000, 19999, 1000) +PID = os.getpid() +RNG1 = np.random.RandomState(PID) +RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) class TrainerDDPMixin(ABC): @@ -362,7 +362,6 @@ def configure_slurm_ddp(self, num_gpu_nodes): def determine_local_rank(self): if self.is_slurm_managing_tasks: return int(os.environ['SLURM_LOCALID']) - else: return int(os.environ.get('LOCAL_RANK', 0)) From 85759bdb5185d46c9311d7940e546ebd2706567e Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 24 Jul 2020 01:14:58 +0200 Subject: [PATCH 19/55] add pickable --- tests/base/test_datasets.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/base/test_datasets.py diff --git a/tests/base/test_datasets.py b/tests/base/test_datasets.py new file mode 100644 index 0000000000000..2b3cd603b74cd --- /dev/null +++ b/tests/base/test_datasets.py @@ -0,0 +1,19 @@ +import pickle + +import cloudpickle +import pytest + +from tests.base.datasets import MNIST, TrialMNIST, AverageDataset + + +@pytest.mark.parametrize('dataset_cls', [MNIST, TrialMNIST, AverageDataset]) +def test_pickling_dataset_mnist(tmpdir, dataset_cls): + mnist = dataset_cls() + + mnist_pickled = pickle.dumps(mnist) + mnist_loaded = pickle.loads(mnist_pickled) + # assert vars(mnist) == vars(mnist_loaded) + + mnist_pickled = cloudpickle.dumps(mnist) + mnist_loaded = cloudpickle.loads(mnist_pickled) + # assert vars(mnist) == vars(mnist_loaded) From c8e7a7032397e507b612c22f39a23e5ad2d57ae6 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 25 Jul 2020 22:08:11 +0200 Subject: [PATCH 20/55] typo --- tests/models/test_tpu.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index d0e752c9018ba..92c336e0acf39 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -59,20 +59,20 @@ def test_model_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_tpu_index_7(tmpdir): +def test_model_tpu_index_5(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, - tpu_cores=[7], + tpu_cores=[5], limit_train_batches=0.4, limit_val_batches=0.4 ) model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -168,18 +168,19 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_7(tmpdir): +def test_model_16bit_tpu_index_5(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( default_root_dir=tmpdir, + precision=16, max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - tpu_cores=[7], + tpu_cores=[5], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -213,10 +214,10 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, - tpu_cores=[8], + tpu_cores=[5], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' + 
assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 19919a35b4e0fb5904f4a447854ab4ac142c884a Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 26 Jul 2020 00:12:51 +0200 Subject: [PATCH 21/55] apex --- pytorch_lightning/accelerator_backends/gpu_backend.py | 5 +++-- pytorch_lightning/accelerator_backends/tpu_backend.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/gpu_backend.py b/pytorch_lightning/accelerator_backends/gpu_backend.py index 81128e1009425..72b9002ca5668 100644 --- a/pytorch_lightning/accelerator_backends/gpu_backend.py +++ b/pytorch_lightning/accelerator_backends/gpu_backend.py @@ -14,6 +14,7 @@ import torch +from pytorch_lightning import LightningModule try: from apex import amp except ImportError: @@ -45,7 +46,7 @@ def setup(self, model): # TODO: remove with dropping NVIDIA AMP support native_amp_available = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") - if self.trainer.use_amp and not native_amp_available: + if APEX_AVAILABLE and self.trainer.use_amp and not native_amp_available: model = self._setup_nvidia_apex(model) return model @@ -53,7 +54,7 @@ def train(self, model): results = self.trainer.run_pretrain_routine(model) return results - def _setup_nvidia_apex(self, model): + def _setup_nvidia_apex(self, model: LightningModule): model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) self.trainer.optimizers = optimizers self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 27b6ce0f92416..ee4088e5e7308 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -15,8 +15,7 @@ import os from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning import _logger as log - +from pytorch_lightning import _logger as log, LightningModule try: import torch_xla @@ -48,7 +47,7 @@ def teardown(self): # when training completes, load the weights back in main process self.__load_weights_on_main_process() - def train(self, model): + def train(self, model: LightningModule): self.trainer.model = model # train @@ -71,7 +70,7 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx, model): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule): """ Here we are inside each individual process """ @@ -88,14 +87,14 @@ def tpu_train_in_process(self, tpu_core_idx, model): # save weights at the end of training self.__save_end_of_training_weights(model) - def __save_end_of_training_weights(self, model): + def __save_end_of_training_weights(self, model: LightningModule): # when training ends on these platforms dump weights to get out of the main process if self.trainer.on_colab_kaggle: rank_zero_warn('cleaning up... 
please do not interrupt') self.trainer.save_spawn_weights(model) - def __setup_tpu_training(self, model): + def __setup_tpu_training(self, model: LightningModule): # use the default device from the process tpu_device = xm.xla_device() From 4ed34a6aa78901ae222740b93f29e51cd7e5cc80 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 26 Jul 2020 00:22:59 +0200 Subject: [PATCH 22/55] typo --- pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +- tests/base/datasets.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index ee4088e5e7308..c66f4726b2219 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -37,7 +37,7 @@ def setup(self): rank_zero_info(f'training on {self.trainer.tpu_cores} TPU cores') if not XLA_AVAILABLE: - raise MisconfigurationException('No TPU devices found.') + raise MisconfigurationException('PyTorch XLA not installed.') # COLAB_GPU is an env var available by default in Colab environments. self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' diff --git a/tests/base/datasets.py b/tests/base/datasets.py index e55ef3484f7ea..e9b630d1da7c9 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -105,14 +105,14 @@ def _download(self, data_folder: str) -> None: urllib.request.urlretrieve(url, fpath) -def _try_load(path_data, trials=30): +def _try_load(path_data, trials: int = 30, delta: float = 0.3): res = None assert os.path.isfile(path_data) for _ in range(trials): try: res = torch.load(path_data) except Exception: - time.sleep(1) + time.sleep(delta) else: break return res From b11c6bc424339ad61f85d0b0eacc735e1663cb2a Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 09:31:22 +0200 Subject: [PATCH 23/55] wip --- .../accelerator_backends/ddp_spawn_backend.py | 6 ++-- .../accelerator_backends/tpu_backend.py | 31 ++++++++++++++++++- .../trainer/distrib_data_parallel.py | 8 ++--- pytorch_lightning/trainer/trainer.py | 13 ++++---- tests/base/develop_utils.py | 2 +- tests/models/test_tpu.py | 4 +-- 6 files changed, 47 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 7cd05fea87ca4..c23d12dbc5e8b 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -59,13 +59,13 @@ def teardown(self, model): self.trainer.model = model return results - def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): + def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0): """ Entry point for ddp Args: process_idx: - q: + mp_queue: model: is_master: proc_offset: @@ -166,7 +166,7 @@ def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): model = self.trainer.get_model() # persist info in ddp_spawn - self.trainer.transfer_ddp_spawn_state_on_fit_end(model, q, results) + self.trainer.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index c66f4726b2219..e6a60581431f6 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -61,6 +61,35 @@ def 
train(self, model: LightningModule): start_method=self.start_method ) + def _run_spawn(self, model, nprocs): + + # pass in a state q + smp = xmp.get_context('spawn') + queue = smp.SimpleQueue() + + xmp.spawn( + self.tpu_train_in_process, + args=(model, queue), + nprocs=self.trainer.tpu_cores, + start_method=self.start_method + ) + + # restore main state with best weights + best_path = queue.get() + results = queue.get() + last_path = queue.get() + + # transfer back the best path to the trainer + self.checkpoint_callback.best_model_path = best_path + + # load last weights + if last_path is not None and not self.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + self.model = model + return results + def __load_weights_on_main_process(self): model = self.trainer.model @@ -70,7 +99,7 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue): """ Here we are inside each individual process """ diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index d2cdb532510e1..1175dbfa3c223 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -473,18 +473,18 @@ def spawn_ddp_children(self, model): sleep(delay) local_rank = 0 - results = self.ddp_train(local_rank, q=None, model=model, is_master=True) + results = self.ddp_train(local_rank, mp_queue=None, model=model, is_master=True) del os.environ['WORLD_SIZE'] return results - def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): + def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0): """ Entry point for ddp Args: process_idx: - q: + mp_queue: multiprocessing queue model: is_master: proc_offset: @@ -577,7 +577,7 @@ def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): model = self.get_model() # persist info in ddp_spawn - self.transfer_ddp_spawn_state_on_fit_end(model, q, results) + self.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ca81066fb4018..d4ae6f68adc60 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,7 +51,8 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.trainer.configuration_validator import ConfigValidator -from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend +from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, \ + DataParallelBackend # warnings to ignore in trainer warnings.filterwarnings( @@ -1061,7 +1062,7 @@ def fit( elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) - self.ddp_train(process_idx=task, q=None, model=model) + self.ddp_train(process_idx=task, mp_queue=None, model=model) elif self.use_ddp: @@ -1070,21 +1071,21 @@ def fit( if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) - self.ddp_train(process_idx=task, q=None, model=model) + self.ddp_train(process_idx=task, mp_queue=None, 
model=model) # torchelastic or general non_slurm ddp elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) - self.ddp_train(process_idx=task, q=None, model=model) + self.ddp_train(process_idx=task, mp_queue=None, model=model) elif self.distributed_backend == 'ddp_cpu': - self.accelerator_backend = accelerator_backends.DDPSpawnBackend(self) + self.accelerator_backend = DDPSpawnBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model, nprocs=self.num_processes) results = self.accelerator_backend.teardown(model) elif self.distributed_backend == 'ddp_spawn': - self.accelerator_backend = accelerator_backends.DDPSpawnBackend(self) + self.accelerator_backend = DDPSpawnBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model, nprocs=self.num_processes) results = self.accelerator_backend.teardown(model) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 0b23c61c48e10..ada745951494c 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -60,7 +60,7 @@ def get_data_path(expt_logger, path_dir=None): return path_expt -def load_model_from_checkpoint(logger, root_weights_dir, module_class=EvalModelTemplate, path_expt=None): +def load_model_from_checkpoint(logger, root_weights_dir, module_class=EvalModelTemplate): trained_model = module_class.load_from_checkpoint(root_weights_dir) assert trained_model is not None, 'loading model failed' return trained_model diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 92c336e0acf39..b5377a3af3f66 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -92,7 +92,7 @@ def test_model_tpu_cores_8(tmpdir): # 8 cores needs a big dataset def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32) + dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) return dataset model.train_dataloader = long_train_loader model.val_dataloader = long_train_loader @@ -157,7 +157,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): # 8 cores needs a big dataset def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32) + dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) return dataset model.train_dataloader = long_train_loader model.val_dataloader = long_train_loader From dee7fa50dfbe308392d15feca340b43faefbb2b1 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 09:58:40 +0200 Subject: [PATCH 24/55] wip --- .../accelerator_backends/ddp_spawn_backend.py | 17 +++--- .../accelerator_backends/tpu_backend.py | 56 ++++++++----------- .../trainer/distrib_data_parallel.py | 16 +++--- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index c23d12dbc5e8b..6aee68f6634f2 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -30,26 +30,27 @@ class DDPSpawnBackend(object): def __init__(self, trainer): self.trainer = trainer - self.q = None + self.mp_queue = None def setup(self): self.trainer.set_random_port() # pass in a state q smp = mp.get_context('spawn') - self.q = smp.SimpleQueue() + self.mp_queue = 
smp.SimpleQueue() def train(self, model, nprocs): - mp.spawn(self.ddp_train, nprocs=nprocs, args=(self.q, model,)) + mp.spawn(self.ddp_train, nprocs=nprocs, args=(self.mp_queue, model,)) def teardown(self, model): # restore main state with best weights - best_path = self.q.get() - results = self.q.get() - last_path = self.q.get() + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() # transfer back the best path to the trainer self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also bets score # load last weights if last_path is not None and not self.trainer.testing: @@ -65,7 +66,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 Args: process_idx: - mp_queue: + mp_queue: multiprocessing queue model: is_master: proc_offset: @@ -166,7 +167,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 model = self.trainer.get_model() # persist info in ddp_spawn - self.trainer.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) + self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index e6a60581431f6..da74c7117ee1d 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -13,6 +13,9 @@ # limitations under the License. import os + +import torch + from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning import _logger as log, LightningModule @@ -32,6 +35,7 @@ class TPUBackend(object): def __init__(self, trainer): self.trainer = trainer self.start_method = None + self.mp_queue = None def setup(self): rank_zero_info(f'training on {self.trainer.tpu_cores} TPU cores') @@ -42,10 +46,23 @@ def setup(self): # COLAB_GPU is an env var available by default in Colab environments. 
self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' + # pass in a state q + smp = xmp.get_context(self.start_method) + self.mp_queue = smp.SimpleQueue() + def teardown(self): + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # transfer back the best path to the trainer + self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also bets score # when training completes, load the weights back in main process self.__load_weights_on_main_process() + return results def train(self, model: LightningModule): self.trainer.model = model @@ -56,40 +73,11 @@ def train(self, model: LightningModule): else: xmp.spawn( self.tpu_train_in_process, - args=(model,), + args=(model, self.mp_queue), nprocs=self.trainer.tpu_cores, start_method=self.start_method ) - def _run_spawn(self, model, nprocs): - - # pass in a state q - smp = xmp.get_context('spawn') - queue = smp.SimpleQueue() - - xmp.spawn( - self.tpu_train_in_process, - args=(model, queue), - nprocs=self.trainer.tpu_cores, - start_method=self.start_method - ) - - # restore main state with best weights - best_path = queue.get() - results = queue.get() - last_path = queue.get() - - # transfer back the best path to the trainer - self.checkpoint_callback.best_model_path = best_path - - # load last weights - if last_path is not None and not self.testing: - ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - self.model = model - return results - def __load_weights_on_main_process(self): model = self.trainer.model @@ -99,7 +87,7 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue=None): """ Here we are inside each individual process """ @@ -111,13 +99,15 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_que self.__setup_tpu_training(model) # Run the pretrain routine - self.trainer.run_pretrain_routine(model) + results = self.trainer.run_pretrain_routine(model) # save weights at the end of training self.__save_end_of_training_weights(model) - def __save_end_of_training_weights(self, model: LightningModule): + # persist info in spawn + self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) + def __save_end_of_training_weights(self, model: LightningModule): # when training ends on these platforms dump weights to get out of the main process if self.trainer.on_colab_kaggle: rank_zero_warn('cleaning up... 
please do not interrupt') diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 1175dbfa3c223..10a1ea6b768d2 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -577,7 +577,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 model = self.get_model() # persist info in ddp_spawn - self.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) + self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() @@ -585,7 +585,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 if self.global_rank == 0 and self.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results - def transfer_ddp_spawn_state_on_fit_end(self, model, q, results): + def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.distributed_backend not in ['ddp_spawn', 'ddp_cpu', 'tpu']: return @@ -594,17 +594,17 @@ def transfer_ddp_spawn_state_on_fit_end(self, model, q, results): if self.checkpoint_callback is not None: best_model_path = self.checkpoint_callback.best_model_path - if self.global_rank == 0 and q is not None: + if self.global_rank == 0 and mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') - q.put(best_model_path) - q.put(results) + mp_queue.put(best_model_path) + mp_queue.put(results) # save the last weights last_path = None if not self.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) torch.save(model.state_dict(), last_path) - q.put(last_path) + mp_queue.put(last_path) def save_spawn_weights(self, model): """ @@ -613,7 +613,7 @@ def save_spawn_weights(self, model): :return: """ if self.is_global_zero: - path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt') + path = os.path.join(self.default_root_dir, '__temp_weight_distributed_end.ckpt') self.save_checkpoint(path) return path @@ -629,7 +629,7 @@ def load_spawn_weights(self, original_model): if self.is_global_zero: # load weights saved in ddp - path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt') + path = os.path.join(self.default_root_dir, '__temp_weight_distributed_end.ckpt') loaded_model = original_model.__class__.load_from_checkpoint(path) # copy loaded weights to old model From e2ece1bd414799fce54fb165c6bbe9958b740b07 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 10:38:54 +0200 Subject: [PATCH 25/55] wip --- .../accelerator_backends/tpu_backend.py | 12 ++++++++++-- pytorch_lightning/trainer/trainer.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index da74c7117ee1d..a7bf4ddc4dc9c 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -16,6 +16,7 @@ import torch +import torch.multiprocessing as mp from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning import _logger as log, LightningModule @@ -47,10 +48,10 @@ def setup(self): self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' # pass in a state q - smp = xmp.get_context(self.start_method) + smp = 
mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() - def teardown(self): + def teardown(self, model): # restore main state with best weights best_path = self.mp_queue.get() results = self.mp_queue.get() @@ -60,6 +61,13 @@ def teardown(self): self.trainer.checkpoint_callback.best_model_path = best_path # todo, pass also bets score + # load last weights + if last_path is not None and not self.trainer.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + self.trainer.model = model + # when training completes, load the weights back in main process self.__load_weights_on_main_process() return results diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d4ae6f68adc60..4296ec86b5e96 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1112,7 +1112,7 @@ def fit( self.accelerator_backend = TPUBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model) - self.accelerator_backend.teardown() + self.accelerator_backend.teardown(model) else: self.accelerator_backend = CPUBackend(self) From 6c11daeeaa248c4aa2482514a899fe8638c9d21f Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 11:02:57 +0200 Subject: [PATCH 26/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 5 +++-- pytorch_lightning/trainer/trainer.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index a7bf4ddc4dc9c..2e72ea60d3f3d 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -17,9 +17,10 @@ import torch import torch.multiprocessing as mp +from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning import _logger as log, LightningModule +from pytorch_lightning import _logger as log try: import torch_xla @@ -48,7 +49,7 @@ def setup(self): self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' # pass in a state q - smp = mp.get_context('spawn') + smp = mp.get_context(self.start_method) self.mp_queue = smp.SimpleQueue() def teardown(self, model): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4296ec86b5e96..a2181c0bca6cc 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,8 +51,8 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.trainer.configuration_validator import ConfigValidator -from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, \ - DataParallelBackend +from pytorch_lightning.accelerator_backends import ( + GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, DataParallelBackend) # warnings to ignore in trainer warnings.filterwarnings( From c21628399995cd69cfcbca1cbaddfe1d3549f93a Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 12:23:43 +0200 Subject: [PATCH 27/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py 
b/pytorch_lightning/accelerator_backends/tpu_backend.py index 2e72ea60d3f3d..2f3436567a566 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -63,9 +63,9 @@ def teardown(self, model): # todo, pass also bets score # load last weights - if last_path is not None and not self.trainer.testing: - ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) + # if last_path is not None and not self.trainer.testing: + # ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + # model.load_state_dict(ckpt) self.trainer.model = model From 4f726a7fdc74d074d11c5c52a04abf531d0174bd Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 13:54:01 +0200 Subject: [PATCH 28/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 8 +++++--- pytorch_lightning/trainer/distrib_data_parallel.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 2f3436567a566..dbb54124a63e3 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -63,9 +63,9 @@ def teardown(self, model): # todo, pass also bets score # load last weights - # if last_path is not None and not self.trainer.testing: - # ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - # model.load_state_dict(ckpt) + if last_path is not None and not self.trainer.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) self.trainer.model = model @@ -113,6 +113,8 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_que # save weights at the end of training self.__save_end_of_training_weights(model) + # for some reason the state is missing + self.trainer.distributed_backend = 'TPU' # persist info in spawn self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 10a1ea6b768d2..f8305136b7297 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -586,7 +586,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 return results def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): - if self.distributed_backend not in ['ddp_spawn', 'ddp_cpu', 'tpu']: + if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']: return # track the best model path @@ -596,6 +596,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.global_rank == 0 and mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary mp_queue.put(best_model_path) mp_queue.put(results) From 91d1f5600547b20ada76ae08b09eb867e2ab95da Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 14:41:13 +0200 Subject: [PATCH 29/55] wip --- .../accelerator_backends/tpu_backend.py | 64 ++++++++++--------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index dbb54124a63e3..37feeafd33a82 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ 
b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -82,7 +82,7 @@ def train(self, model: LightningModule): else: xmp.spawn( self.tpu_train_in_process, - args=(model, self.mp_queue), + args=(model, self.trainer, self.mp_queue), nprocs=self.trainer.tpu_cores, start_method=self.start_method ) @@ -96,67 +96,69 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue=None): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, trainer=None, mp_queue=None): """ Here we are inside each individual process """ - if not self.trainer.testing: - self.trainer.setup('fit') + if not trainer: + trainer = self.trainer + if not trainer.testing: + trainer.setup('fit') model.setup('fit') # setup TPU training - self.__setup_tpu_training(model) + self.__setup_tpu_training(model, trainer) # Run the pretrain routine - results = self.trainer.run_pretrain_routine(model) + results = trainer.run_pretrain_routine(model) # save weights at the end of training - self.__save_end_of_training_weights(model) + self.__save_end_of_training_weights(model, trainer) - # for some reason the state is missing - self.trainer.distributed_backend = 'TPU' # persist info in spawn - self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) + trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - def __save_end_of_training_weights(self, model: LightningModule): + @staticmethod + def __save_end_of_training_weights(model: LightningModule, trainer=None): # when training ends on these platforms dump weights to get out of the main process - if self.trainer.on_colab_kaggle: + if trainer.on_colab_kaggle: rank_zero_warn('cleaning up... please do not interrupt') - self.trainer.save_spawn_weights(model) + trainer.save_spawn_weights(model) - def __setup_tpu_training(self, model: LightningModule): + @staticmethod + def __setup_tpu_training(model: LightningModule, trainer=None): # use the default device from the process tpu_device = xm.xla_device() # if given an ordinal device, use this as the device - if self.trainer.tpu_id is not None: - tpu_device = xm.xla_device(self.trainer.tpu_id) + if trainer.tpu_id is not None: + tpu_device = xm.xla_device(trainer.tpu_id) # track the device and move model to it - self.trainer._device = tpu_device - model.to(self.trainer._device) + trainer._device = tpu_device + model.to(trainer._device) # get the appropriate tpu ranks - self.trainer.tpu_local_core_rank = xm.get_local_ordinal() - self.trainer.tpu_global_core_rank = xm.get_ordinal() + trainer.tpu_local_core_rank = xm.get_local_ordinal() + trainer.tpu_global_core_rank = xm.get_ordinal() # avoid duplicating progress bar - if self.trainer.tpu_global_core_rank != 0 and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() + if trainer.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None: + trainer.progress_bar_callback.disable() - self.trainer.global_rank = self.trainer.tpu_local_core_rank - rank_zero_only.rank = self.trainer.global_rank + trainer.global_rank = trainer.tpu_local_core_rank + rank_zero_only.rank = trainer.global_rank # CHOOSE OPTIMIZER # allow for lr schedulers as well - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies + optimizers, 
lr_schedulers, optimizer_frequencies = trainer.init_optimizers(model) + trainer.optimizers = optimizers + trainer.lr_schedulers = lr_schedulers + trainer.optimizer_frequencies = optimizer_frequencies # init 16 bit for TPU - if self.trainer.precision == 16: + if trainer.precision == 16: os.environ['XLA_USE_BF16'] = str(1) - log.info(f'INIT TPU local core: {self.trainer.tpu_local_core_rank},' - f' global rank: {self.trainer.tpu_global_core_rank}') + log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' + f' global rank: {trainer.tpu_global_core_rank}') From d6f1137a52d792c1365264651dddd80a75abc849 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 14:51:41 +0200 Subject: [PATCH 30/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 8 ++++---- tests/models/test_tpu.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 37feeafd33a82..6238fadee4a56 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -118,15 +118,15 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine # persist info in spawn trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - @staticmethod - def __save_end_of_training_weights(model: LightningModule, trainer=None): + @classmethod + def __save_end_of_training_weights(cls, model: LightningModule, trainer=None): # when training ends on these platforms dump weights to get out of the main process if trainer.on_colab_kaggle: rank_zero_warn('cleaning up... please do not interrupt') trainer.save_spawn_weights(model) - @staticmethod - def __setup_tpu_training(model: LightningModule, trainer=None): + @classmethod + def __setup_tpu_training(cls, model: LightningModule, trainer=None): # use the default device from the process tpu_device = xm.xla_device() diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index b5377a3af3f66..397714820f54f 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -163,7 +163,6 @@ def long_train_loader(): model.val_dataloader = long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False) - assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 9465404df9e41d6ec7319fb55b42888bc12a1266 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:11:07 +0200 Subject: [PATCH 31/55] docs --- docs/source/index.rst | 2 +- pytorch_lightning/__init__.py | 2 +- pytorch_lightning/accelerator_backends/gpu_backend.py | 2 +- pytorch_lightning/accelerator_backends/tpu_backend.py | 5 +++-- pytorch_lightning/core/__init__.py | 6 +++++- pytorch_lightning/core/decorators.py | 2 -- tests/trainer/test_trainer.py | 2 +- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 9ab0da242d330..4b1b7c697a6c8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,7 +93,7 @@ PyTorch Lightning Documentation weights_loading optimizers profiler - use_single_gpu + single_gpu sequences training_tricks transfer_learning diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index ee49879ee3665..5cb42370c28db 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -54,11 +54,11 @@ # We are not importing the rest 
of the lightning during the build process, as it may not be compiled yet else: from pytorch_lightning.core import LightningDataModule, LightningModule, data_loader + from pytorch_lightning.core.step_result import TrainResult, EvalResult from pytorch_lightning.callbacks import Callback from pytorch_lightning.trainer import Trainer from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning import metrics - from pytorch_lightning.core.step_result import TrainResult, EvalResult __all__ = [ 'Trainer', diff --git a/pytorch_lightning/accelerator_backends/gpu_backend.py b/pytorch_lightning/accelerator_backends/gpu_backend.py index 72b9002ca5668..3b5f37671d9e8 100644 --- a/pytorch_lightning/accelerator_backends/gpu_backend.py +++ b/pytorch_lightning/accelerator_backends/gpu_backend.py @@ -14,7 +14,7 @@ import torch -from pytorch_lightning import LightningModule +from pytorch_lightning.core import LightningModule try: from apex import amp except ImportError: diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 6238fadee4a56..3cebc7fbc81c6 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -15,8 +15,8 @@ import os import torch - import torch.multiprocessing as mp + from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -161,4 +161,5 @@ def __setup_tpu_training(cls, model: LightningModule, trainer=None): os.environ['XLA_USE_BF16'] = str(1) log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' - f' global rank: {trainer.tpu_global_core_rank}') + f' global rank: {trainer.tpu_global_core_rank}' + f' with XLA_USE_BF16={os.environ["XLA_USE_BF16"]}') diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py index 8a7770752f951..43861ad6dd785 100644 --- a/pytorch_lightning/core/__init__.py +++ b/pytorch_lightning/core/__init__.py @@ -305,5 +305,9 @@ def training_step(self, batch, batch_idx): from pytorch_lightning.core.decorators import data_loader from pytorch_lightning.core.lightning import LightningModule -__all__ = ['LightningDataModule', 'LightningModule', 'data_loader'] +__all__ = [ + 'LightningDataModule', + 'LightningModule', + 'data_loader', +] # __call__ = __all__ diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index b540c1b66bab3..6023de27d853b 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,8 +1,6 @@ from functools import wraps from typing import Callable -import torch - from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import rank_zero_warn diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9eb8fc0f14051..04ea0ba88ee6e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -897,7 +897,7 @@ def test_trainer_config(trainer_kwargs, expected): assert trainer.use_ddp2 is expected["use_ddp2"] assert trainer.num_gpus == expected["num_gpus"] assert trainer.on_gpu is expected["on_gpu"] - assert trainer.use_single_gpu is expected["single_gpu"] + assert trainer.use_single_gpu is expected["use_single_gpu"] assert trainer.num_processes == expected["num_processes"] From 75dc5d27384e3ab6308349249261fdd8ba98223d Mon Sep 17 00:00:00 2001 From: Jirka 
Date: Mon, 27 Jul 2020 15:23:01 +0200 Subject: [PATCH 32/55] typo --- tests/trainer/test_trainer.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 04ea0ba88ee6e..e11ed287f924d 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -814,79 +814,79 @@ def test_num_sanity_val_steps(tmpdir, limit_val_batches): @pytest.mark.parametrize("trainer_kwargs,expected", [ pytest.param( dict(distributed_backend=None, gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="dp", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="dp", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="ddp", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="ddp", num_processes=2, gpus=None), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=2) + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=2) ), pytest.param( dict(distributed_backend="ddp", num_nodes=2, gpus=None), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="ddp_cpu", num_processes=2, gpus=None), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=2) + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=2) ), pytest.param( dict(distributed_backend="ddp2", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend=None, gpus=1), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, single_gpu=True, num_processes=1), + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, use_single_gpu=True, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="dp", gpus=1), - dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, single_gpu=True, num_processes=1), + dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, use_single_gpu=True, num_processes=1), 
marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="ddp", gpus=1), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=1, on_gpu=True, single_gpu=True, num_processes=1), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=1, on_gpu=True, use_single_gpu=True, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="ddp_cpu", num_processes=2, gpus=1), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=2), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=2), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="ddp2", gpus=1), - dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=1, on_gpu=True, single_gpu=False, num_processes=1), + dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=1, on_gpu=True, use_single_gpu=False, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend=None, gpus=2), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=2), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), pytest.param( dict(distributed_backend="dp", gpus=2), - dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1), + dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), pytest.param( dict(distributed_backend="ddp", gpus=2), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=2), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), pytest.param( dict(distributed_backend="ddp2", gpus=2), - dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1), + dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), ]) From 5a7bfa08ebbab25b78954701aad3d703df9e42d4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:33:14 +0200 Subject: [PATCH 33/55] tests --- .github/workflows/ci-testing.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index 7c3095fc281a1..f08e54d136ef2 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -30,6 +30,10 @@ jobs: # TODO: temporary fix till hanging jobs on macOS for py38 is resolved - python-version: 3.8 os: macOS-10.15 + # TODO: temporary fix till pyYaml can be installed, see: https://github.com/actions/setup-python/issues/114 + - python-version: 3.7 + os: ubuntu-18.04 + requires: 'minimal' # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 25 From 
22dd8c0d48eb491b3e162f5f8a29648176387667 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:36:41 +0200 Subject: [PATCH 34/55] tests --- pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 3cebc7fbc81c6..17be12b624f27 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -162,4 +162,4 @@ def __setup_tpu_training(cls, model: LightningModule, trainer=None): log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' f' global rank: {trainer.tpu_global_core_rank}' - f' with XLA_USE_BF16={os.environ["XLA_USE_BF16"]}') + f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}') From e4f2088c7e77c1735dcae2840ce752df01eebf6b Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:57:30 +0200 Subject: [PATCH 35/55] tests --- pytorch_lightning/accelerator_backends/tpu_backend.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 17be12b624f27..18615862497d2 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -63,7 +63,7 @@ def teardown(self, model): # todo, pass also bets score # load last weights - if last_path is not None and not self.trainer.testing: + if last_path and not self.trainer.testing: ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt) @@ -78,7 +78,7 @@ def train(self, model: LightningModule): # train if self.trainer.tpu_id is not None: - self.tpu_train_in_process(self.trainer.tpu_id, model) + self.tpu_train_in_process(self.trainer.tpu_id, model, self.trainer, self.mp_queue) else: xmp.spawn( self.tpu_train_in_process, @@ -118,15 +118,13 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine # persist info in spawn trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - @classmethod - def __save_end_of_training_weights(cls, model: LightningModule, trainer=None): + def __save_end_of_training_weights(self, model: LightningModule, trainer=None): # when training ends on these platforms dump weights to get out of the main process if trainer.on_colab_kaggle: rank_zero_warn('cleaning up... 
please do not interrupt') trainer.save_spawn_weights(model) - @classmethod - def __setup_tpu_training(cls, model: LightningModule, trainer=None): + def __setup_tpu_training(self, model: LightningModule, trainer=None): # use the default device from the process tpu_device = xm.xla_device() From d89de9b1e37eb96f7c3983ba08de6cb827c270f4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 16:17:54 +0200 Subject: [PATCH 36/55] tests --- tests/models/test_tpu.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 397714820f54f..0a0436413d456 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -30,6 +30,7 @@ def test_model_tpu_cores_1(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, limit_val_batches=0.4 @@ -47,6 +48,7 @@ def test_model_tpu_index_1(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, limit_val_batches=0.4 @@ -65,6 +67,7 @@ def test_model_tpu_index_5(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=[5], limit_train_batches=0.4, limit_val_batches=0.4 @@ -83,6 +86,7 @@ def test_model_tpu_cores_8(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, limit_val_batches=0.4 @@ -109,6 +113,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): precision=16, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, limit_val_batches=0.4 @@ -128,6 +133,7 @@ def test_model_16bit_tpu_index_1(tmpdir): precision=16, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, limit_val_batches=0.4 @@ -148,6 +154,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): precision=16, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, limit_val_batches=0.4 @@ -176,6 +183,7 @@ def test_model_16bit_tpu_index_5(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, + distributed_backend='tpu', tpu_cores=[5], ) trainer.fit(model) @@ -195,6 +203,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, + distributed_backend='tpu', tpu_cores=[1], ) trainer.fit(model) @@ -213,6 +222,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, + distributed_backend='tpu', tpu_cores=[5], ) trainer.fit(model) @@ -229,6 +239,7 @@ def test_dataloaders_passed_to_fit(tmpdir): trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, + distributed_backend='tpu', tpu_cores=8, ) result = trainer.fit( @@ -253,7 +264,10 @@ def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id): def test_tpu_misconfiguration(): """Test if trainer.tpu_id is set as expected""" with pytest.raises(MisconfigurationException, match="`tpu_cores` can only be"): - Trainer(tpu_cores=[1, 8]) + Trainer( + tpu_cores=[1, 8], + distributed_backend='tpu', + ) # @patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) @@ -266,6 +280,7 @@ def test_exception_when_no_tpu_found(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, + distributed_backend='tpu', tpu_cores=8, ) From 
161472432c4c7dc5e2950ae3e43378e46b498e70 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 16:31:38 +0200 Subject: [PATCH 37/55] tests --- pytorch_lightning/trainer/distrib_data_parallel.py | 2 +- tests/models/test_tpu.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index f8305136b7297..2546ec7bb3c7d 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -251,7 +251,7 @@ def is_function_implemented(self, *args) -> bool: def init_tpu(self): # turn off all the GPU stuff - self.distributed_backend = None + # self.distributed_backend = 'tpu' # enable tpu self.use_tpu = True diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 0a0436413d456..24b470381231e 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -33,7 +33,7 @@ def test_model_tpu_cores_1(tmpdir): distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -51,7 +51,7 @@ def test_model_tpu_index_1(tmpdir): distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -70,7 +70,7 @@ def test_model_tpu_index_5(tmpdir): distributed_backend='tpu', tpu_cores=[5], limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -89,7 +89,7 @@ def test_model_tpu_cores_8(tmpdir): distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -116,7 +116,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -136,7 +136,7 @@ def test_model_16bit_tpu_index_1(tmpdir): distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -157,7 +157,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() From def846bdb47fe6246d114247fce4d79e541a3b12 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 17:12:16 +0200 Subject: [PATCH 38/55] tests --- .github/workflows/tpu-testing.yml | 4 ++-- tests/models/test_tpu.py | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index f5a08580ea737..6055696635bcf 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -14,8 +14,8 @@ env: GKE_CLUSTER: lightning-cluster GKE_ZONE: us-central1-a IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image - MAX_CHECKS: 60 - CHECK_SPEEP: 30 + MAX_CHECKS: 180 + CHECK_SPEEP: 10 jobs: setup-build-publish-deploy: diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 24b470381231e..652f09762cbd0 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -22,6 +22,17 @@ TPU_AVAILABLE = True +# 8 cores needs a big dataset +def _long_train_loader(): + dataset = DataLoader(TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8)), + batch_size=32, + ) + return dataset + + 
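# With tpu_cores=8 every spawned process builds this loader, so the cached
# MNIST file can end up being downloaded and loaded by several workers at once.
# Later patches in this series harden that path: tests/base/datasets.py refines
# its _try_load() retry helper (random back-off, re-raising the last exception),
# and [PATCH 47/55] serialises the dataset construction through torch_xla's
# MpSerialExecutor, roughly (a sketch of that later form):
#
#     import torch_xla.distributed.xla_multiprocessing as xmp
#     SERIAL_EXEC = xmp.MpSerialExecutor()
#
#     def _serial_train_loader():
#         dataset = SERIAL_EXEC.run(
#             lambda: TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)))
#         return DataLoader(dataset, batch_size=32)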
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_tpu_cores_1(tmpdir): @@ -93,13 +104,9 @@ def test_model_tpu_cores_8(tmpdir): ) model = EvalModelTemplate() - # 8 cores needs a big dataset - def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) - return dataset - model.train_dataloader = long_train_loader - model.val_dataloader = long_train_loader + model.train_dataloader = _long_train_loader + model.val_dataloader = _long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -161,15 +168,11 @@ def test_model_16bit_tpu_cores_8(tmpdir): ) model = EvalModelTemplate() - # 8 cores needs a big dataset - def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) - return dataset - model.train_dataloader = long_train_loader - model.val_dataloader = long_train_loader + model.train_dataloader = _long_train_loader + model.val_dataloader = _long_train_loader - tpipes.run_model_test(trainer_options, model, on_gpu=False) + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From f6060ec802aea59bdeff5fe67f4fae8e719c47e1 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 17:41:44 +0200 Subject: [PATCH 39/55] tests --- .circleci/config.yml | 8 +++++--- .github/workflows/tpu-testing.yml | 3 ++- tests/base/datasets.py | 5 ++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a515ef544f565..34d92c3f57801 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -62,10 +62,11 @@ references: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ + printf "Job not finished yet" && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
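          # (the flush delay below is switched from a hard-coded 30 s to the new
          # CHECK_SPEEP value, 10 s at this point; [PATCH 51/55] later puts a fixed
          # wait back here and reuses CHECK_SPEEP as the polling interval instead)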
- sleep 30 && \ + sleep $CHECK_SPEEP && \ echo "JOB_NAME: $job_name" && \ gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \ if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ @@ -101,7 +102,8 @@ jobs: docker: - image: circleci/python:3.7 environment: - - MAX_CHECKS: 60 + - MAX_CHECKS: 180 + - CHECK_SPEEP: 10 steps: - checkout - go/install diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index 6055696635bcf..b4233ad69a9f7 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -90,7 +90,8 @@ jobs: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ + printf "Job not finished yet" && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
sleep $CHECK_SPEEP && \ diff --git a/tests/base/datasets.py b/tests/base/datasets.py index e9b630d1da7c9..5e2737b39c03f 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -107,7 +107,7 @@ def _download(self, data_folder: str) -> None: def _try_load(path_data, trials: int = 30, delta: float = 0.3): res = None - assert os.path.isfile(path_data) + assert os.path.isfile(path_data), 'missing file: %s' % path_data for _ in range(trials): try: res = torch.load(path_data) @@ -115,6 +115,9 @@ def _try_load(path_data, trials: int = 30, delta: float = 0.3): time.sleep(delta) else: break + else: + import traceback + traceback.print_exc() return res From f0a61746dbd91966c54d988bfde776e4f8b6bf50 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:08:22 +0200 Subject: [PATCH 40/55] tests --- docs/source/new-project.rst | 2 -- tests/base/datasets.py | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 043461fca0b89..dd28bc311d401 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -6,8 +6,6 @@ import torch from torch.nn import functional as F from torch.utils.data import DataLoader - from torchvision.datasets import MNIST - from torchvision import transforms Quick Start diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 5e2737b39c03f..b01d3385c45fa 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -111,13 +111,12 @@ def _try_load(path_data, trials: int = 30, delta: float = 0.3): for _ in range(trials): try: res = torch.load(path_data) - except Exception: + except Exception as ex: time.sleep(delta) else: break else: - import traceback - traceback.print_exc() + raise ex return res From bdaaaefcc8a8a9e461a957b3db1118dee3095603 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:18:22 +0200 Subject: [PATCH 41/55] tests --- .circleci/config.yml | 2 +- .github/workflows/tpu-testing.yml | 2 +- tests/models/test_grad_norm.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 34d92c3f57801..da4674dfda515 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -62,7 +62,7 @@ references: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. - printf "Job not finished yet" && \ + printf "Waiting for job to finish: " && \ while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index b4233ad69a9f7..011bb727cd9cb 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -90,7 +90,7 @@ jobs: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. 
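          # (at this point MAX_CHECKS is 180 and each check still sleeps 30 s, so the
          # TPU job gets roughly 90 minutes before the step gives up; [PATCH 51/55]
          # later drives the poll interval from CHECK_SPEEP as well)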
- printf "Job not finished yet" && \ + printf "Waiting for job to finish: " && \ while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 7483167755a8f..dc7eee557d448 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -42,8 +42,8 @@ def on_after_backward(self): self.stored_grad_norms.append(out) -@pytest.mark.parametrize("norm_type", [1., 1.25, 1.5, 2, 3, 5, 10, 'inf']) -def test_grad_tracking(tmpdir, norm_type, rtol=1e-2): +@pytest.mark.parametrize("norm_type", [1., 1.25, 2, 3, 5, 10, 'inf']) +def test_grad_tracking(tmpdir, norm_type, rtol=5e-3): os.environ['PL_DEV_DEBUG'] = '1' # rtol=5e-3 respects the 3 decimals rounding in `.grad_norms` and above From 458471da880839b00ce1e920fd20e7c88506c52c Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:22:28 +0200 Subject: [PATCH 42/55] tests --- tests/base/datasets.py | 5 +++-- tests/models/test_tpu.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index b01d3385c45fa..b29984438f84b 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -1,5 +1,6 @@ import logging import os +import random import urllib.request import time from typing import Tuple, Optional, Sequence @@ -105,14 +106,14 @@ def _download(self, data_folder: str) -> None: urllib.request.urlretrieve(url, fpath) -def _try_load(path_data, trials: int = 30, delta: float = 0.3): +def _try_load(path_data, trials: int = 30, delta: float = 1.): res = None assert os.path.isfile(path_data), 'missing file: %s' % path_data for _ in range(trials): try: res = torch.load(path_data) except Exception as ex: - time.sleep(delta) + time.sleep(delta * random.random()) else: break else: diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 652f09762cbd0..f96be7e77fce9 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -26,9 +26,9 @@ def _long_train_loader(): dataset = DataLoader(TrialMNIST( download=True, - num_samples=2000, + num_samples=500, digits=(0, 1, 2, 5, 8)), - batch_size=32, + batch_size=16, ) return dataset From 6e6588a6d53dfeb65becc63669ad7d2b6f15f032 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:27:03 +0200 Subject: [PATCH 43/55] tests --- tests/models/test_tpu.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index f96be7e77fce9..704fbe214e521 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -12,6 +12,8 @@ try: import torch_xla + import torch_xla.distributed.xla_multiprocessing as xmp + SERIAL_EXEC = xmp.MpSerialExecutor() # TODO: The tests are aborted if the following lines are uncommented. 
Must be resolved with XLA team # device = torch_xla.core.xla_model.xla_device() # device_type = torch_xla.core.xla_model.xla_device_hw(device) @@ -24,11 +26,15 @@ # 8 cores needs a big dataset def _long_train_loader(): - dataset = DataLoader(TrialMNIST( - download=True, - num_samples=500, - digits=(0, 1, 2, 5, 8)), - batch_size=16, + # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb + dataset = DataLoader( + SERIAL_EXEC.run( + TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8)), + batch_size=32, + ) ) return dataset From 6dc0e85479c4ac9286b91ce54afe397f8f1eeed2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:45:57 +0200 Subject: [PATCH 44/55] tests --- tests/base/datasets.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index b29984438f84b..d92d5a35e088b 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -107,17 +107,21 @@ def _download(self, data_folder: str) -> None: def _try_load(path_data, trials: int = 30, delta: float = 1.): - res = None + res, exp = None, None + assert trials, "at least some trial has to be set" assert os.path.isfile(path_data), 'missing file: %s' % path_data for _ in range(trials): try: res = torch.load(path_data) except Exception as ex: + exp = ex time.sleep(delta * random.random()) else: break else: - raise ex + # raise the caught exception if any + if exp: + raise exp return res From 6196724b13fdb48e75c2d9e55415a4a66e8c89fb Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:50:09 +0200 Subject: [PATCH 45/55] tests --- tests/models/test_tpu.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 704fbe214e521..130781af5d761 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -32,9 +32,10 @@ def _long_train_loader(): TrialMNIST( download=True, num_samples=2000, - digits=(0, 1, 2, 5, 8)), - batch_size=32, - ) + digits=(0, 1, 2, 5, 8), + ) + ), + batch_size=32, ) return dataset From 183fcc2d851a9f2c84e2857b17696b41bb136cfc Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:56:58 +0200 Subject: [PATCH 46/55] tests --- tests/models/test_tpu.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 130781af5d761..c56ec4010bd83 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -28,12 +28,10 @@ def _long_train_loader(): # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb dataset = DataLoader( - SERIAL_EXEC.run( - TrialMNIST( - download=True, - num_samples=2000, - digits=(0, 1, 2, 5, 8), - ) + TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8), ), batch_size=32, ) @@ -112,8 +110,8 @@ def test_model_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _long_train_loader - model.val_dataloader = _long_train_loader + model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -176,8 +174,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _long_train_loader - model.val_dataloader = _long_train_loader + 
model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) From e7a5295652dd4f89ea4d18daa1aac5d4362d3374 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 19:07:20 +0200 Subject: [PATCH 47/55] tests --- tests/models/test_tpu.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index c56ec4010bd83..ee17e2162ead3 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -25,14 +25,18 @@ # 8 cores needs a big dataset -def _long_train_loader(): +def _serial_train_loader(): # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb - dataset = DataLoader( - TrialMNIST( + + def _get_dataset(): + return TrialMNIST( download=True, num_samples=2000, digits=(0, 1, 2, 5, 8), - ), + ) + + dataset = DataLoader( + SERIAL_EXEC.run(_get_dataset), batch_size=32, ) return dataset @@ -110,8 +114,8 @@ def test_model_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) - model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.train_dataloader = _serial_train_loader + model.val_dataloader = _serial_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -174,8 +178,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) - model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.train_dataloader = _serial_train_loader + model.val_dataloader = _serial_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) From 199919883e32c49a1341e090f38c2973e4266454 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 21:33:17 +0200 Subject: [PATCH 48/55] tests --- tests/models/test_tpu.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index ee17e2162ead3..89d5dce279840 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -24,22 +24,19 @@ TPU_AVAILABLE = True -# 8 cores needs a big dataset -def _serial_train_loader(): - # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb +_LARGER_DATASET = TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8), +) - def _get_dataset(): - return TrialMNIST( - download=True, - num_samples=2000, - digits=(0, 1, 2, 5, 8), - ) - dataset = DataLoader( - SERIAL_EXEC.run(_get_dataset), +# 8 cores needs a big dataset +def _serial_train_loader(): + return DataLoader( + _LARGER_DATASET, batch_size=32, ) - return dataset @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 9579bce6ce248ca27653311be1d510b9b3590463 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 21:54:09 +0200 Subject: [PATCH 49/55] tests --- pytorch_lightning/accelerator_backends/tpu_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 18615862497d2..7591f6e22e31c 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ 
-46,7 +46,8 @@ def setup(self): raise MisconfigurationException('PyTorch XLA not installed.') # COLAB_GPU is an env var available by default in Colab environments. - self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' + # see: https://discuss.pytorch.org/t/segfault-with-multiprocessing-queue/81292/2 + self.start_method = 'fork' # pass in a state q smp = mp.get_context(self.start_method) From fde01de908bcec61202bac0a0e196b2c2dc7539f Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 22:08:25 +0200 Subject: [PATCH 50/55] docs --- CHANGELOG.md | 2 ++ pytorch_lightning/accelerator_backends/tpu_backend.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af692fdd0f938..e65efa50cb38d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `weights_save_path` getting ignored when `logger=False` is passed to Trainer ([#2681](https://github.com/PyTorchLightning/pytorch-lightning/pull/2681)) +- Fixed TPU multi-core and Float16 ([#2632](https://github.com/PyTorchLightning/pytorch-lightning/pull/2632)) + ## [0.8.5] - 2020-07-09 ### Added diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 7591f6e22e31c..b40ae3c23e7f8 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -45,7 +45,6 @@ def setup(self): if not XLA_AVAILABLE: raise MisconfigurationException('PyTorch XLA not installed.') - # COLAB_GPU is an env var available by default in Colab environments. # see: https://discuss.pytorch.org/t/segfault-with-multiprocessing-queue/81292/2 self.start_method = 'fork' From 6abee2084984b8d617d52be40b5a4cacf9c17a20 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 22:22:18 +0200 Subject: [PATCH 51/55] docs --- .circleci/config.yml | 8 ++++---- .github/workflows/tpu-testing.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index da4674dfda515..ba24d3bb82086 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -63,10 +63,10 @@ references: # still the job hasn't finished, give up and return the starting # non-zero status code. printf "Waiting for job to finish: " && \ - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
From 6abee2084984b8d617d52be40b5a4cacf9c17a20 Mon Sep 17 00:00:00 2001
From: Jirka
Date: Mon, 27 Jul 2020 22:22:18 +0200
Subject: [PATCH 51/55] docs

---
 .circleci/config.yml              | 8 ++++----
 .github/workflows/tpu-testing.yml | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index da4674dfda515..ba24d3bb82086 100755
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -63,10 +63,10 @@ references:
             # still the job hasn't finished, give up and return the starting
             # non-zero status code.
             printf "Waiting for job to finish: " && \
-            while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \
+            while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
             echo "Done waiting. Job status code: $status_code" && \
             # Allow time for logs to flush.
-            sleep $CHECK_SPEEP && \
+            sleep 10 && \
             echo "JOB_NAME: $job_name" && \
             gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
             if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \
@@ -102,8 +102,8 @@ jobs:
     docker:
       - image: circleci/python:3.7
     environment:
-      - MAX_CHECKS: 180
-      - CHECK_SPEEP: 10
+      - MAX_CHECKS: 240
+      - CHECK_SPEEP: 5
     steps:
       - checkout
       - go/install

diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml
index 011bb727cd9cb..245e424181007 100644
--- a/.github/workflows/tpu-testing.yml
+++ b/.github/workflows/tpu-testing.yml
@@ -14,8 +14,8 @@ env:
   GKE_CLUSTER: lightning-cluster
   GKE_ZONE: us-central1-a
   IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
-  MAX_CHECKS: 180
-  CHECK_SPEEP: 10
+  MAX_CHECKS: 240
+  CHECK_SPEEP: 5
 
 jobs:
   setup-build-publish-deploy:
@@ -91,10 +91,10 @@ jobs:
           # still the job hasn't finished, give up and return the starting
           # non-zero status code.
           printf "Waiting for job to finish: " && \
-          while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep 30; done && \
+          while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
           echo "Done waiting. Job status code: $status_code" && \
           # Allow time for logs to flush.
-          sleep $CHECK_SPEEP && \
+          sleep 10 && \
           echo "JOB_NAME: $job_name" && \
           echo "GKE_CLUSTER: $GKE_CLUSTER" && \
           echo "GKE_ZONE: $GKE_ZONE" && \
From bf6ac749eefa2ca2c07802fcb28934f01b7979ab Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 27 Jul 2020 22:31:44 +0200
Subject: [PATCH 52/55] Apply suggestions from code review

Co-authored-by: Ethan Harris
Co-authored-by: Rohit Gupta
---
 pytorch_lightning/accelerator_backends/tpu_backend.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py
index b40ae3c23e7f8..fac8594f0358f 100644
--- a/pytorch_lightning/accelerator_backends/tpu_backend.py
+++ b/pytorch_lightning/accelerator_backends/tpu_backend.py
@@ -124,14 +124,15 @@ def __save_end_of_training_weights(self, model: LightningModule, trainer=None):
             rank_zero_warn('cleaning up... please do not interrupt')
             trainer.save_spawn_weights(model)
 
-    def __setup_tpu_training(self, model: LightningModule, trainer=None):
+    def __setup_tpu_training(self, model: LightningModule, trainer):
         # use the default device from the process
         tpu_device = xm.xla_device()
 
         # if given an ordinal device, use this as the device
         if trainer.tpu_id is not None:
             tpu_device = xm.xla_device(trainer.tpu_id)
-
+        else:
+            tpu_device = xm.xla_device()
         # track the device and move model to it
         trainer._device = tpu_device
         model.to(trainer._device)

From 6f30bc56d1cd1561ec77a3e7fa12b41382bd4a37 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 27 Jul 2020 22:56:25 +0200
Subject: [PATCH 53/55] Apply suggestions from code review

Co-authored-by: Ethan Harris
---
 pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py
index fac8594f0358f..95203550eb37e 100644
--- a/pytorch_lightning/accelerator_backends/tpu_backend.py
+++ b/pytorch_lightning/accelerator_backends/tpu_backend.py
@@ -118,7 +118,7 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine
         # persist info in spawn
         trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)
 
-    def __save_end_of_training_weights(self, model: LightningModule, trainer=None):
+    def __save_end_of_training_weights(self, model: LightningModule, trainer):
         # when training ends on these platforms dump weights to get out of the main process
         if trainer.on_colab_kaggle:
             rank_zero_warn('cleaning up... please do not interrupt')

From 45233a50870c0de317fab1c999e82c005cb80d4d Mon Sep 17 00:00:00 2001
From: Jirka
Date: Mon, 27 Jul 2020 23:17:04 +0200
Subject: [PATCH 54/55] docs

---
 tests/base/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/base/datasets.py b/tests/base/datasets.py
index d92d5a35e088b..27e614eee68cf 100644
--- a/tests/base/datasets.py
+++ b/tests/base/datasets.py
@@ -63,7 +63,6 @@ def __init__(self, root: str = PATH_DATASETS, train: bool = True,
             raise RuntimeError('Dataset not found.')
 
         data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME
-        # FIXME: try to fix loading
         self.data, self.targets = _try_load(os.path.join(self.cached_folder_path, data_file))
 
     def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
@@ -107,6 +106,7 @@ def _download(self, data_folder: str) -> None:
 
 
 def _try_load(path_data, trials: int = 30, delta: float = 1.):
+    """Resolve loading the same file at the same time from multiple concurrent processes."""
     res, exp = None, None
     assert trials, "at least some trial has to be set"
     assert os.path.isfile(path_data), 'missing file: %s' % path_data
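The _try_load helper that gains a docstring in patch 54 retries reads that may race with another process touching the same cached file; its full body is not shown in this series. The idea is roughly the following (an illustrative sketch under that assumption, not the exact upstream implementation):

    # Sketch: retry loading a cached file a few times before giving up, since
    # several test processes may read/write the same cache path concurrently.
    import os
    import time

    import torch


    def try_load(path_data, trials: int = 30, delta: float = 1.0):
        assert trials, "at least some trial has to be set"
        assert os.path.isfile(path_data), 'missing file: %s' % path_data
        exception = None
        for _ in range(trials):
            try:
                return torch.load(path_data)
            except Exception as err:  # e.g. a partially written file
                exception = err
                time.sleep(delta)
        raise exception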
From 6c9495e04bbe602cb662fac8376c77dfe2280922 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 27 Jul 2020 23:05:36 +0200
Subject: [PATCH 55/55] Apply suggestions from code review

Co-authored-by: Rohit Gupta
---
 pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py
index 95203550eb37e..8d1d1b271b7dc 100644
--- a/pytorch_lightning/accelerator_backends/tpu_backend.py
+++ b/pytorch_lightning/accelerator_backends/tpu_backend.py
@@ -126,7 +126,7 @@ def __save_end_of_training_weights(self, model: LightningModule, trainer):
 
     def __setup_tpu_training(self, model: LightningModule, trainer):
         # use the default device from the process
-        tpu_device = xm.xla_device()
+        # tpu_device = xm.xla_device()
 
         # if given an ordinal device, use this as the device
         if trainer.tpu_id is not None:
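The series breaks off mid-hunk here. Taken together, patches 52 and 55 leave the device selection in __setup_tpu_training looking roughly like the sketch below (reconstructed for readability from the hunks above, not quoted from the final file):

    # Sketch: pick an explicit TPU ordinal when one was requested, otherwise the
    # per-process default device, then move the model onto it.
    import torch_xla.core.xla_model as xm


    def setup_tpu_training(model, trainer):
        if trainer.tpu_id is not None:
            tpu_device = xm.xla_device(trainer.tpu_id)  # explicit ordinal, e.g. tpu_cores=[1]
        else:
            tpu_device = xm.xla_device()                # default device for this process
        trainer._device = tpu_device
        model.to(trainer._device)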