From bacc2081188537f98981d0855e1ed18083454447 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 15:20:56 +0200 Subject: [PATCH 01/55] init --- .../test_trainer_test_loop.py} | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) rename tests/{models/test_test_loop.py => trainer/test_trainer_test_loop.py} (95%) diff --git a/tests/models/test_test_loop.py b/tests/trainer/test_trainer_test_loop.py similarity index 95% rename from tests/models/test_test_loop.py rename to tests/trainer/test_trainer_test_loop.py index 10d8d35800e81..e8151c2c5bd1d 100644 --- a/tests/models/test_test_loop.py +++ b/tests/trainer/test_trainer_test_loop.py @@ -1,9 +1,9 @@ -import os +import pytest +import torch + import pytorch_lightning as pl -from tests.base import EvalModelTemplate import tests.base.develop_utils as tutils -import torch -import pytest +from tests.base import EvalModelTemplate @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") @@ -12,7 +12,7 @@ def test_single_gpu_test(tmpdir): model = EvalModelTemplate() trainer = pl.Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, @@ -43,7 +43,7 @@ def test_dp_test(tmpdir): model = EvalModelTemplate() trainer = pl.Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, @@ -72,7 +72,7 @@ def test_ddp_spawn_test(tmpdir): model = EvalModelTemplate() trainer = pl.Trainer( - default_root_dir=os.getcwd(), + default_root_dir=tmpdir, max_epochs=2, limit_train_batches=10, limit_val_batches=10, From b93fab4d930dd68111c1969b243b9407c25472e6 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:12:34 +0200 Subject: [PATCH 02/55] rename --- .github/workflows/tpu-testing.yml | 11 ++++---- tests/models/test_tpu.py | 47 ++++++++++++++++--------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index d4ab768a885dd..f5a08580ea737 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -14,6 +14,8 @@ env: GKE_CLUSTER: lightning-cluster GKE_ZONE: us-central1-a IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image + MAX_CHECKS: 60 + CHECK_SPEEP: 30 jobs: setup-build-publish-deploy: @@ -82,17 +84,16 @@ jobs: job_name=${job_name% created} && \ echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \ i=0 && \ - # 30 checks spaced 30s apart = 900s total. - max_checks=30 && \ + # 60 checks spaced 30s apart = 900s total. status_code=2 && \ # Check on the job periodically. Set the status code depending on what - # happened to the job in Kubernetes. If we try max_checks times and + # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. 
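Illustration only, not part of the patch: the shell one-liner that follows is dense, and the same bounded-polling pattern reads more easily in Python. Function and argument names here are stand-ins.

import time

def wait_for_job(get_status, max_checks=60, check_sleep=30):
    """Poll get_status() until it reports success or failure, or give up.

    Mirrors the loop below: returns 0 on success, 1 on failure, and the
    starting code 2 if the job is still unfinished after max_checks polls
    spaced check_sleep seconds apart.
    """
    status_code = 2
    for _ in range(max_checks):
        status = get_status()  # e.g. parsed from `kubectl get jobs <name> -o jsonpath=...`
        if status == 'failed':
            status_code = 1
            break
        if status == 'succeeded':
            status_code = 0
            break
        print('Job not finished yet')
        time.sleep(check_sleep)
    return status_code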
- while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. - sleep 60 && \ + sleep $CHECK_SPEEP && \ echo "JOB_NAME: $job_name" && \ echo "GKE_CLUSTER: $GKE_CLUSTER" && \ echo "GKE_ZONE: $GKE_ZONE" && \ diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 754e2652225c0..116db49d7ed77 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -23,7 +23,7 @@ @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_model_1(tmpdir): +def test_model_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -39,7 +39,7 @@ def test_base_tpu_model_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_model_idx_1(tmpdir): +def test_model_tpu_idx_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -55,7 +55,23 @@ def test_base_tpu_model_idx_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_model_8(tmpdir): +def test_model_tpu_idx_1_8(tmpdir): + """Make sure model trains on TPU.""" + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + tpu_cores=[1, 8], + limit_train_batches=0.4, + limit_val_batches=0.4 + ) + + model = EvalModelTemplate() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + + +@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +def test_model_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -76,10 +92,11 @@ def long_train_loader(): model.val_dataloader = long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_16bit_model_core_1(tmpdir): +def test_model_16bit_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -97,7 +114,7 @@ def test_base_tpu_16bit_model_core_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_16bit_model_idx_core(tmpdir): +def test_model_16bit_tpu_idx_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -115,7 +132,7 @@ def test_base_tpu_16bit_model_idx_core(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_base_tpu_16bit_model_8_cores(tmpdir): +def test_model_16bit_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -158,23 +175,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): 
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_single_tpu_core_model(tmpdir): - """Test if single TPU core training works""" - model = EvalModelTemplate() - trainer = Trainer( - default_root_dir=tmpdir, - progress_bar_refresh_rate=0, - max_epochs=1, - train_percent_check=0.1, - val_percent_check=0.1, - tpu_cores=8, - ) - trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' - - -@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_multi_core_tpu_model(tmpdir): +def test_model_16bit_tpu_index_1_8(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( From 15cb65cffaf1311a0d1e8d97da464ecb01eddafb Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:42:33 +0200 Subject: [PATCH 03/55] tpu_core_idx --- pytorch_lightning/trainer/distrib_parts.py | 1 - tests/models/test_tpu.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 3c0461e713832..f5dd04c1bd332 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -79,7 +79,6 @@ class TrainerDPMixin(ABC): use_tpu: bool data_parallel_device_ids: ... progress_bar_callback: ... - tpu_id: Optional[int] on_colab_kaggle: str save_spawn_weights: Callable logger: ... diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 116db49d7ed77..687de6649c828 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -52,22 +52,24 @@ def test_model_tpu_idx_1(tmpdir): model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_tpu_idx_1_8(tmpdir): +def test_model_tpu_idx_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, - tpu_cores=[1, 8], + tpu_cores=[8, 1], limit_train_batches=0.4, limit_val_batches=0.4 ) model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -92,7 +94,6 @@ def long_train_loader(): model.val_dataloader = long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -128,6 +129,7 @@ def test_model_16bit_tpu_idx_1(tmpdir): model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" From 5ac3c6118eda461b2e3a8cf559ecb4d54390fb73 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:43:51 +0200 Subject: [PATCH 04/55] idx 8 --- tests/models/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 687de6649c828..4c4b11ad628f0 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -62,7 +62,7 @@ def test_model_tpu_idx_8(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, 
max_epochs=1, - tpu_cores=[8, 1], + tpu_cores=[8], limit_train_batches=0.4, limit_val_batches=0.4 ) From 724b7b4c32f86d00736260831857c1ff2d1bdc94 Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 16:55:26 +0200 Subject: [PATCH 05/55] idxs --- tests/models/test_tpu.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 4c4b11ad628f0..461b143ee08bb 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -39,7 +39,7 @@ def test_model_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_tpu_idx_1(tmpdir): +def test_model_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -56,7 +56,7 @@ def test_model_tpu_idx_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_tpu_idx_8(tmpdir): +def test_model_tpu_index_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -115,7 +115,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_16bit_tpu_idx_1(tmpdir): +def test_model_16bit_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, @@ -177,7 +177,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") -def test_model_16bit_tpu_index_1_8(tmpdir): +def test_model_16bit_tpu_index_8(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( @@ -185,7 +185,7 @@ def test_model_16bit_tpu_index_1_8(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - tpu_cores=[1, 8], + tpu_cores=[8], ) trainer.fit(model) assert trainer.tpu_id is None @@ -221,7 +221,14 @@ def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id): assert Trainer(tpu_cores=tpu_cores).tpu_id == expected_tpu_id +def test_tpu_misconfiguration(): + """Test if trainer.tpu_id is set as expected""" + with pytest.raises(MisconfigurationException, match="`tpu_cores` can only be"): + Trainer(tpu_cores=[1, 8]) + + @patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) +@pytest.mark.skipif(TPU_AVAILABLE, reason="test requires missing TPU") def test_exception_when_no_tpu_found(tmpdir): """Test if exception is thrown when xla devices are not available""" model = EvalModelTemplate() From d817f0ec3390afdf3ae5572aaf58aba8d923c19f Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 17 Jul 2020 21:44:44 +0200 Subject: [PATCH 06/55] @pl_multi_process_test --- tests/models/test_tpu.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 461b143ee08bb..09806db43313c 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -2,13 +2,14 @@ from unittest.mock import patch import pytest +from torch.utils.data import DataLoader +import tests.base.develop_pipelines as tpipes from pytorch_lightning import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.base import EvalModelTemplate -import tests.base.develop_pipelines as tpipes from tests.base.datasets import TrialMNIST -from torch.utils.data import DataLoader +from tests.base.develop_utils import pl_multi_process_test try: import torch_xla @@ -23,6 +24,7 @@ 
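For context on the @pl_multi_process_test decorator being added to the tests below: it is imported from tests/base/develop_utils.py, and its body appears in later patches of this series. A condensed, standalone sketch of the same run-the-test-in-a-child-process pattern follows; it is a simplified stand-in, not the actual decorator.

import functools
import multiprocessing


def run_in_child_process(func):
    """Run the wrapped test in a separate process and assert it succeeded."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        queue = multiprocessing.SimpleQueue()

        def inner(q, *a, **kw):
            try:
                func(*a, **kw)
                q.put(1)        # success marker read back by the parent
            except Exception:
                q.put(-1)       # any failure in the child maps to -1

        # relies on the default (fork) start method so the closure is usable;
        # with 'spawn' the target would need to be an importable function
        proc = multiprocessing.Process(target=inner, args=(queue, *args), kwargs=kwargs)
        proc.start()
        proc.join()
        result = queue.get()
        assert result == 1, 'expected 1, but returned %s' % result
    return wrapper


@run_in_child_process
def test_addition():
    assert 1 + 1 == 2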
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -39,6 +41,7 @@ def test_model_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -56,6 +59,7 @@ def test_model_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_index_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -73,6 +77,7 @@ def test_model_tpu_index_8(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -97,6 +102,7 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_cores_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -115,6 +121,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_index_1(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -134,6 +141,7 @@ def test_model_16bit_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_cores_8(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( @@ -160,6 +168,7 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_early_stop_checkpoints_on_tpu(tmpdir): """Test if single TPU core training works""" model = EvalModelTemplate() @@ -170,13 +179,14 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, - tpu_cores=[8], + tpu_cores=[1], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:1' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_model_16bit_tpu_index_8(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() @@ -192,6 +202,7 @@ def test_model_16bit_tpu_index_8(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test def test_dataloaders_passed_to_fit(tmpdir): """Test if dataloaders passed to trainer works on TPU""" From be79c50a245741fb75a4097a076be9e6a270ecf7 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 00:01:37 +0200 Subject: [PATCH 07/55] assert --- tests/models/test_tpu.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 09806db43313c..10074854eea54 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -167,6 +167,23 @@ def long_train_loader(): assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" +@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test +def test_model_16bit_tpu_index_8(tmpdir): + """Test if distributed TPU core training works""" 
+ model = EvalModelTemplate() + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + train_percent_check=0.4, + val_percent_check=0.2, + tpu_cores=[8], + ) + trainer.fit(model) + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" + + @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_early_stop_checkpoints_on_tpu(tmpdir): @@ -187,18 +204,20 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_8(tmpdir): - """Test if distributed TPU core training works""" +def test_early_stop_checkpoints_on_tpu(tmpdir): + """Test if single TPU core training works""" model = EvalModelTemplate() trainer = Trainer( + early_stop_callback=True, default_root_dir=tmpdir, - max_epochs=1, - train_percent_check=0.4, - val_percent_check=0.2, + progress_bar_refresh_rate=0, + max_epochs=50, + limit_train_batches=10, + limit_val_batches=10, tpu_cores=[8], ) trainer.fit(model) - assert trainer.tpu_id is None + assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -238,7 +257,7 @@ def test_tpu_misconfiguration(): Trainer(tpu_cores=[1, 8]) -@patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) +# @patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) @pytest.mark.skipif(TPU_AVAILABLE, reason="test requires missing TPU") def test_exception_when_no_tpu_found(tmpdir): """Test if exception is thrown when xla devices are not available""" From f35437f0d91bc629cfe72bf1dc5bd5504e5c44cf Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 00:27:14 +0200 Subject: [PATCH 08/55] assert --- tests/base/develop_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 068c8294ee5cb..6dc4cb2fe893c 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -100,7 +100,7 @@ def inner_f(queue, **kwargs): try: func(**kwargs) queue.put(1) - except Exception as e: + except Exception: import traceback traceback.print_exc() queue.put(-1) @@ -109,6 +109,6 @@ def inner_f(queue, **kwargs): p.start() p.join() result = queue.get() - assert result == 1 + assert result == 1, 'expected 1, but returned %s' % result return wrapper From 773e52a1aeafa7ebd17ea709dfbae548f685e8bd Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 01:06:01 +0200 Subject: [PATCH 09/55] deamon --- tests/base/develop_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 6dc4cb2fe893c..978235935f899 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -89,6 +89,7 @@ def init_checkpoint_callback(logger): def pl_multi_process_test(func): + """Wrapper for running multi-processing tests.""" @functools.wraps(func) def wrapper(*args, **kwargs): @@ -105,10 +106,13 @@ def inner_f(queue, **kwargs): traceback.print_exc() queue.put(-1) - p = Process(target=inner_f, args=(queue,), kwargs=kwargs) - p.start() - p.join() + proc = Process(target=inner_f, args=(queue,), kwargs=kwargs) + proc.start() + proc.join() result = queue.get() assert result == 1, 'expected 1, but returned %s' % result + proc.close() + queue.close() + return wrapper From 
30cda419a8fac552274542c276f54a568571756b Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 18 Jul 2020 01:51:50 +0200 Subject: [PATCH 10/55] no close --- tests/base/develop_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 978235935f899..0b23c61c48e10 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -109,10 +109,8 @@ def inner_f(queue, **kwargs): proc = Process(target=inner_f, args=(queue,), kwargs=kwargs) proc.start() proc.join() + result = queue.get() assert result == 1, 'expected 1, but returned %s' % result - proc.close() - queue.close() - return wrapper From be98712d917b8ebb9e5c5602b97922adfb552692 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 19 Jul 2020 00:34:31 +0200 Subject: [PATCH 11/55] imort --- tests/models/test_tpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 10074854eea54..dbb0a6a30f6b3 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -1,5 +1,4 @@ import os -from unittest.mock import patch import pytest from torch.utils.data import DataLoader From f93c5aadd6890e3c996a49cdef0dedad37e99a09 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 19 Jul 2020 18:39:24 +0200 Subject: [PATCH 12/55] msg --- tests/models/test_tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index dbb0a6a30f6b3..2e4ef0e4afcae 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -269,5 +269,5 @@ def test_exception_when_no_tpu_found(tmpdir): tpu_cores=8, ) - with pytest.raises(MisconfigurationException, match='No TPU devices found.'): + with pytest.raises(MisconfigurationException, match='PyTorch XLA not installed.'): trainer.fit(model) From 1dab0c85246685fef27955862a7cd857c66567c9 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 21 Jul 2020 09:04:24 +0200 Subject: [PATCH 13/55] use_single_gpu --- docs/source/index.rst | 2 +- pytorch_lightning/trainer/distrib_data_parallel.py | 10 ++++------ pytorch_lightning/trainer/distrib_parts.py | 4 ++-- pytorch_lightning/trainer/evaluation_loop.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 12 ++++++------ pytorch_lightning/trainer/training_loop.py | 4 ++-- tests/trainer/test_trainer.py | 2 +- 7 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 4b1b7c697a6c8..9ab0da242d330 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,7 +93,7 @@ PyTorch Lightning Documentation weights_loading optimizers profiler - single_gpu + use_single_gpu sequences training_tricks transfer_learning diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 63c6c9b8513ab..21019bff12cd5 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -171,8 +171,6 @@ def train_fx(trial_hparams, cluster_manager, _): try: import torch_xla - import torch_xla.core.xla_model as xm - import torch_xla.distributed.xla_multiprocessing as xmp except ImportError: XLA_AVAILABLE = False else: @@ -263,7 +261,7 @@ def set_distributed_mode(self, distributed_backend): self.use_ddp = False self.use_ddp2 = False self.use_horovod = False - self.single_gpu = False + self.use_single_gpu = False if distributed_backend is None: if self.has_horovodrun(): @@ -272,7 +270,7 @@ def set_distributed_mode(self, 
distributed_backend): if self.num_nodes > 1 or self.num_processes > 1: self.use_ddp = True # ddp_cpu elif self.num_gpus == 1: - self.single_gpu = True + self.use_single_gpu = True elif self.num_gpus > 1: rank_zero_warn('You requested multiple GPUs but did not specify a backend, e.g.' ' Trainer(distributed_backend=dp) (or ddp, ddp2).' @@ -283,7 +281,7 @@ def set_distributed_mode(self, distributed_backend): if distributed_backend == "dp": # do nothing if num_gpus == 0 if self.num_gpus == 1: - self.single_gpu = True + self.use_single_gpu = True self.use_dp = True elif self.num_gpus > 1: self.use_dp = True @@ -293,7 +291,7 @@ def set_distributed_mode(self, distributed_backend): if self.num_nodes > 1 or self.num_processes > 1: self.use_ddp = True # ddp_cpu elif self.num_gpus == 1: - self.single_gpu = True + self.use_single_gpu = True self.use_ddp = True elif self.num_gpus > 1: self.use_ddp = True diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index f5dd04c1bd332..63db623d91f0b 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -69,7 +69,7 @@ class TrainerDPMixin(ABC): use_ddp2: bool use_ddp: bool testing: bool - single_gpu: bool + use_single_gpu: bool root_gpu: ... amp_level: str precision: ... @@ -128,7 +128,7 @@ def copy_trainer_model_properties(self, model): m.use_ddp = self.use_ddp m.use_amp = self.use_amp m.testing = self.testing - m.single_gpu = self.single_gpu + m.use_single_gpu = self.use_single_gpu m.use_tpu = self.use_tpu m.tpu_local_core_rank = self.tpu_local_core_rank m.tpu_global_core_rank = self.tpu_global_core_rank diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index f52f9c12f3c83..add9bb24c672a 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -163,7 +163,7 @@ class TrainerEvaluationLoopMixin(ABC): use_dp: bool use_ddp2: bool use_horovod: bool - single_gpu: bool + use_single_gpu: bool data_parallel_device_ids: ... 
model: LightningModule num_test_batches: List[int] @@ -616,7 +616,7 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: args[0] = batch # single GPU data transfer - if self.single_gpu: + if self.use_single_gpu: # for single GPU put inputs on gpu manually root_gpu = 0 if isinstance(self.data_parallel_device_ids, list): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b774ce13e1ef0..ca81066fb4018 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,7 +51,7 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.trainer.configuration_validator import ConfigValidator -from pytorch_lightning import accelerator_backends +from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend # warnings to ignore in trainer warnings.filterwarnings( @@ -1094,7 +1094,7 @@ def fit( results = self.spawn_ddp_children(model) elif self.use_dp: - self.accelerator_backend = accelerator_backends.DataParallelBackend(self) + self.accelerator_backend = DataParallelBackend(self) self.accelerator_backend.setup(model) results = self.accelerator_backend.train() self.accelerator_backend.teardown() @@ -1102,19 +1102,19 @@ def fit( elif self.use_horovod: results = self.horovod_train(model) - elif self.single_gpu: - self.accelerator_backend = accelerator_backends.GPUBackend(self) + elif self.use_single_gpu: + self.accelerator_backend = GPUBackend(self) model = self.accelerator_backend.setup(model) results = self.accelerator_backend.train(model) elif self.use_tpu: - self.accelerator_backend = accelerator_backends.TPUBackend(self) + self.accelerator_backend = TPUBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model) self.accelerator_backend.teardown() else: - self.accelerator_backend = accelerator_backends.CPUBackend(self) + self.accelerator_backend = CPUBackend(self) self.accelerator_backend.setup(model) results = self.accelerator_backend.train(model) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7f12d20151c24..a0f0144679df9 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -217,7 +217,7 @@ class TrainerTrainLoopMixin(ABC): use_dp: bool use_ddp2: bool use_horovod: bool - single_gpu: bool + use_single_gpu: bool use_tpu: bool data_parallel_device_ids: ... check_val_every_n_epoch: ... 
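Aside, as a simplified illustration rather than the real classes: the backends imported explicitly in trainer.py above (GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, DataParallelBackend) are all driven by fit() through the same small protocol: construct with the trainer, setup the model, train, and optionally teardown. Roughly:

class MinimalBackend:
    """Stand-in showing the shape of an accelerator backend, not the real API."""

    def __init__(self, trainer):
        self.trainer = trainer

    def setup(self, model):
        # real backends move or wrap the model for their device here
        return model

    def train(self, model):
        # real backends end up in the trainer's pretrain/training routine
        return {'status': 'finished'}

    def teardown(self, model):
        # optional: restore weights in the main process, clean up resources
        return model


backend = MinimalBackend(trainer=None)
model = backend.setup(model=object())
results = backend.train(model)
assert results['status'] == 'finished'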
@@ -1068,7 +1068,7 @@ def training_forward(self, batch, batch_idx, opt_idx, hiddens): output = self.model.training_step(*args) # single GPU forward - elif self.single_gpu: + elif self.use_single_gpu: gpu_id = 0 if isinstance(self.data_parallel_device_ids, list): gpu_id = self.data_parallel_device_ids[0] diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index bfca63be60aa0..9eb8fc0f14051 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -897,7 +897,7 @@ def test_trainer_config(trainer_kwargs, expected): assert trainer.use_ddp2 is expected["use_ddp2"] assert trainer.num_gpus == expected["num_gpus"] assert trainer.on_gpu is expected["on_gpu"] - assert trainer.single_gpu is expected["single_gpu"] + assert trainer.use_single_gpu is expected["single_gpu"] assert trainer.num_processes == expected["num_processes"] From 910f1c4a07067ebcf6b44cd80c3f6646166970a4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 22 Jul 2020 00:16:33 +0200 Subject: [PATCH 14/55] dataset --- tests/base/datasets.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 5eb32656c83a3..b1991de6aa9e6 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -1,6 +1,7 @@ import logging import os import urllib.request +import time from typing import Tuple, Optional, Sequence import torch @@ -61,7 +62,14 @@ def __init__(self, root: str = PATH_DATASETS, train: bool = True, raise RuntimeError('Dataset not found.') data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME - self.data, self.targets = torch.load(os.path.join(self.cached_folder_path, data_file)) + # FIXME: try to fix loading + for _ in range(30): + try: + self.data, self.targets = torch.load(os.path.join(self.cached_folder_path, data_file)) + except Exception: + time.sleep(1) + else: + break def __getitem__(self, idx: int) -> Tuple[Tensor, int]: img = self.data[idx].float().unsqueeze(0) From 410742a386fb54365d12197d6057c12abb2a11a0 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 22 Jul 2020 00:35:57 +0200 Subject: [PATCH 15/55] idx --- tests/models/test_tpu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 2e4ef0e4afcae..b3aebda22cebe 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -59,13 +59,13 @@ def test_model_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_tpu_index_8(tmpdir): +def test_model_tpu_index_7(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, - tpu_cores=[8], + tpu_cores=[7], limit_train_batches=0.4, limit_val_batches=0.4 ) @@ -168,7 +168,7 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_8(tmpdir): +def test_model_16bit_tpu_index_7(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( @@ -176,7 +176,7 @@ def test_model_16bit_tpu_index_8(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - tpu_cores=[8], + tpu_cores=[7], ) trainer.fit(model) assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' From 2ad7b42ae023d043db6d1e9931036a9fb75a6777 Mon Sep 17 00:00:00 2001 From: Jirka Date: Wed, 22 Jul 2020 23:30:31 +0200 Subject: [PATCH 16/55] 
fix idx --- tests/models/test_tpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index b3aebda22cebe..d0e752c9018ba 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -179,7 +179,7 @@ def test_model_16bit_tpu_index_7(tmpdir): tpu_cores=[7], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -216,7 +216,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): tpu_cores=[8], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 7576549c19db673dee41a825ff49511e2e4007e7 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 23 Jul 2020 00:12:21 +0200 Subject: [PATCH 17/55] dataset --- tests/base/datasets.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index b1991de6aa9e6..e55ef3484f7ea 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -63,13 +63,7 @@ def __init__(self, root: str = PATH_DATASETS, train: bool = True, data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME # FIXME: try to fix loading - for _ in range(30): - try: - self.data, self.targets = torch.load(os.path.join(self.cached_folder_path, data_file)) - except Exception: - time.sleep(1) - else: - break + self.data, self.targets = _try_load(os.path.join(self.cached_folder_path, data_file)) def __getitem__(self, idx: int) -> Tuple[Tensor, int]: img = self.data[idx].float().unsqueeze(0) @@ -111,6 +105,19 @@ def _download(self, data_folder: str) -> None: urllib.request.urlretrieve(url, fpath) +def _try_load(path_data, trials=30): + res = None + assert os.path.isfile(path_data) + for _ in range(trials): + try: + res = torch.load(path_data) + except Exception: + time.sleep(1) + else: + break + return res + + def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor: tensor = tensor.clone() mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) @@ -195,7 +202,7 @@ def prepare_data(self, download: bool) -> None: for fname in (self.TRAIN_FILE_NAME, self.TEST_FILE_NAME): path_fname = os.path.join(super().cached_folder_path, fname) assert os.path.isfile(path_fname), 'Missing cached file: %s' % path_fname - data, targets = torch.load(path_fname) + data, targets = _try_load(path_fname) data, targets = self._prepare_subset(data, targets, self.num_samples, self.digits) torch.save((data, targets), os.path.join(self.cached_folder_path, fname)) From accea7fdff536fb9b50c3c5fe5a93873a3425fcf Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 23 Jul 2020 12:08:00 +0200 Subject: [PATCH 18/55] format --- pytorch_lightning/trainer/distrib_data_parallel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 21019bff12cd5..d2cdb532510e1 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -176,9 +176,9 @@ def train_fx(trial_hparams, cluster_manager, _): else: XLA_AVAILABLE = True -pid = os.getpid() -rng1 = np.random.RandomState(pid) -RANDOM_PORTS 
= rng1.randint(10000, 19999, 1000) +PID = os.getpid() +RNG1 = np.random.RandomState(PID) +RANDOM_PORTS = RNG1.randint(10000, 19999, 1000) class TrainerDDPMixin(ABC): @@ -362,7 +362,6 @@ def configure_slurm_ddp(self, num_gpu_nodes): def determine_local_rank(self): if self.is_slurm_managing_tasks: return int(os.environ['SLURM_LOCALID']) - else: return int(os.environ.get('LOCAL_RANK', 0)) From 85759bdb5185d46c9311d7940e546ebd2706567e Mon Sep 17 00:00:00 2001 From: Jirka Date: Fri, 24 Jul 2020 01:14:58 +0200 Subject: [PATCH 19/55] add pickable --- tests/base/test_datasets.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/base/test_datasets.py diff --git a/tests/base/test_datasets.py b/tests/base/test_datasets.py new file mode 100644 index 0000000000000..2b3cd603b74cd --- /dev/null +++ b/tests/base/test_datasets.py @@ -0,0 +1,19 @@ +import pickle + +import cloudpickle +import pytest + +from tests.base.datasets import MNIST, TrialMNIST, AverageDataset + + +@pytest.mark.parametrize('dataset_cls', [MNIST, TrialMNIST, AverageDataset]) +def test_pickling_dataset_mnist(tmpdir, dataset_cls): + mnist = dataset_cls() + + mnist_pickled = pickle.dumps(mnist) + mnist_loaded = pickle.loads(mnist_pickled) + # assert vars(mnist) == vars(mnist_loaded) + + mnist_pickled = cloudpickle.dumps(mnist) + mnist_loaded = cloudpickle.loads(mnist_pickled) + # assert vars(mnist) == vars(mnist_loaded) From c8e7a7032397e507b612c22f39a23e5ad2d57ae6 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 25 Jul 2020 22:08:11 +0200 Subject: [PATCH 20/55] typo --- tests/models/test_tpu.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index d0e752c9018ba..92c336e0acf39 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -59,20 +59,20 @@ def test_model_tpu_index_1(tmpdir): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_tpu_index_7(tmpdir): +def test_model_tpu_index_5(tmpdir): """Make sure model trains on TPU.""" trainer_options = dict( default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, - tpu_cores=[7], + tpu_cores=[5], limit_train_batches=0.4, limit_val_batches=0.4 ) model = EvalModelTemplate() tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:8' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @@ -168,18 +168,19 @@ def long_train_loader(): @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test -def test_model_16bit_tpu_index_7(tmpdir): +def test_model_16bit_tpu_index_5(tmpdir): """Test if distributed TPU core training works""" model = EvalModelTemplate() trainer = Trainer( default_root_dir=tmpdir, + precision=16, max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, - tpu_cores=[7], + tpu_cores=[5], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' + assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @@ -213,10 +214,10 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, - tpu_cores=[8], + tpu_cores=[5], ) trainer.fit(model) - assert torch_xla._XLAC._xla_get_default_device() == 'xla:7' + 
assert torch_xla._XLAC._xla_get_default_device() == 'xla:5' @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 19919a35b4e0fb5904f4a447854ab4ac142c884a Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 26 Jul 2020 00:12:51 +0200 Subject: [PATCH 21/55] apex --- pytorch_lightning/accelerator_backends/gpu_backend.py | 5 +++-- pytorch_lightning/accelerator_backends/tpu_backend.py | 11 +++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/gpu_backend.py b/pytorch_lightning/accelerator_backends/gpu_backend.py index 81128e1009425..72b9002ca5668 100644 --- a/pytorch_lightning/accelerator_backends/gpu_backend.py +++ b/pytorch_lightning/accelerator_backends/gpu_backend.py @@ -14,6 +14,7 @@ import torch +from pytorch_lightning import LightningModule try: from apex import amp except ImportError: @@ -45,7 +46,7 @@ def setup(self, model): # TODO: remove with dropping NVIDIA AMP support native_amp_available = hasattr(torch.cuda, "amp") and hasattr(torch.cuda.amp, "autocast") - if self.trainer.use_amp and not native_amp_available: + if APEX_AVAILABLE and self.trainer.use_amp and not native_amp_available: model = self._setup_nvidia_apex(model) return model @@ -53,7 +54,7 @@ def train(self, model): results = self.trainer.run_pretrain_routine(model) return results - def _setup_nvidia_apex(self, model): + def _setup_nvidia_apex(self, model: LightningModule): model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level) self.trainer.optimizers = optimizers self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 27b6ce0f92416..ee4088e5e7308 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -15,8 +15,7 @@ import os from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning import _logger as log - +from pytorch_lightning import _logger as log, LightningModule try: import torch_xla @@ -48,7 +47,7 @@ def teardown(self): # when training completes, load the weights back in main process self.__load_weights_on_main_process() - def train(self, model): + def train(self, model: LightningModule): self.trainer.model = model # train @@ -71,7 +70,7 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx, model): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule): """ Here we are inside each individual process """ @@ -88,14 +87,14 @@ def tpu_train_in_process(self, tpu_core_idx, model): # save weights at the end of training self.__save_end_of_training_weights(model) - def __save_end_of_training_weights(self, model): + def __save_end_of_training_weights(self, model: LightningModule): # when training ends on these platforms dump weights to get out of the main process if self.trainer.on_colab_kaggle: rank_zero_warn('cleaning up... 
please do not interrupt') self.trainer.save_spawn_weights(model) - def __setup_tpu_training(self, model): + def __setup_tpu_training(self, model: LightningModule): # use the default device from the process tpu_device = xm.xla_device() From 4ed34a6aa78901ae222740b93f29e51cd7e5cc80 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sun, 26 Jul 2020 00:22:59 +0200 Subject: [PATCH 22/55] typo --- pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +- tests/base/datasets.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index ee4088e5e7308..c66f4726b2219 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -37,7 +37,7 @@ def setup(self): rank_zero_info(f'training on {self.trainer.tpu_cores} TPU cores') if not XLA_AVAILABLE: - raise MisconfigurationException('No TPU devices found.') + raise MisconfigurationException('PyTorch XLA not installed.') # COLAB_GPU is an env var available by default in Colab environments. self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' diff --git a/tests/base/datasets.py b/tests/base/datasets.py index e55ef3484f7ea..e9b630d1da7c9 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -105,14 +105,14 @@ def _download(self, data_folder: str) -> None: urllib.request.urlretrieve(url, fpath) -def _try_load(path_data, trials=30): +def _try_load(path_data, trials: int = 30, delta: float = 0.3): res = None assert os.path.isfile(path_data) for _ in range(trials): try: res = torch.load(path_data) except Exception: - time.sleep(1) + time.sleep(delta) else: break return res From b11c6bc424339ad61f85d0b0eacc735e1663cb2a Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 09:31:22 +0200 Subject: [PATCH 23/55] wip --- .../accelerator_backends/ddp_spawn_backend.py | 6 ++-- .../accelerator_backends/tpu_backend.py | 31 ++++++++++++++++++- .../trainer/distrib_data_parallel.py | 8 ++--- pytorch_lightning/trainer/trainer.py | 13 ++++---- tests/base/develop_utils.py | 2 +- tests/models/test_tpu.py | 4 +-- 6 files changed, 47 insertions(+), 17 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index 7cd05fea87ca4..c23d12dbc5e8b 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -59,13 +59,13 @@ def teardown(self, model): self.trainer.model = model return results - def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): + def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0): """ Entry point for ddp Args: process_idx: - q: + mp_queue: model: is_master: proc_offset: @@ -166,7 +166,7 @@ def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): model = self.trainer.get_model() # persist info in ddp_spawn - self.trainer.transfer_ddp_spawn_state_on_fit_end(model, q, results) + self.trainer.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index c66f4726b2219..e6a60581431f6 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -61,6 +61,35 @@ def 
train(self, model: LightningModule): start_method=self.start_method ) + def _run_spawn(self, model, nprocs): + + # pass in a state q + smp = xmp.get_context('spawn') + queue = smp.SimpleQueue() + + xmp.spawn( + self.tpu_train_in_process, + args=(model, queue), + nprocs=self.trainer.tpu_cores, + start_method=self.start_method + ) + + # restore main state with best weights + best_path = queue.get() + results = queue.get() + last_path = queue.get() + + # transfer back the best path to the trainer + self.checkpoint_callback.best_model_path = best_path + + # load last weights + if last_path is not None and not self.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + self.model = model + return results + def __load_weights_on_main_process(self): model = self.trainer.model @@ -70,7 +99,7 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue): """ Here we are inside each individual process """ diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index d2cdb532510e1..1175dbfa3c223 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -473,18 +473,18 @@ def spawn_ddp_children(self, model): sleep(delay) local_rank = 0 - results = self.ddp_train(local_rank, q=None, model=model, is_master=True) + results = self.ddp_train(local_rank, mp_queue=None, model=model, is_master=True) del os.environ['WORLD_SIZE'] return results - def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): + def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0): """ Entry point for ddp Args: process_idx: - q: + mp_queue: multiprocessing queue model: is_master: proc_offset: @@ -577,7 +577,7 @@ def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0): model = self.get_model() # persist info in ddp_spawn - self.transfer_ddp_spawn_state_on_fit_end(model, q, results) + self.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ca81066fb4018..d4ae6f68adc60 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,7 +51,8 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.trainer.configuration_validator import ConfigValidator -from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend +from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, \ + DataParallelBackend # warnings to ignore in trainer warnings.filterwarnings( @@ -1061,7 +1062,7 @@ def fit( elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) - self.ddp_train(process_idx=task, q=None, model=model) + self.ddp_train(process_idx=task, mp_queue=None, model=model) elif self.use_ddp: @@ -1070,21 +1071,21 @@ def fit( if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) - self.ddp_train(process_idx=task, q=None, model=model) + self.ddp_train(process_idx=task, mp_queue=None, 
model=model) # torchelastic or general non_slurm ddp elif 'WORLD_SIZE' in os.environ and ('GROUP_RANK' in os.environ or 'NODE_RANK' in os.environ): task = int(os.environ['LOCAL_RANK']) - self.ddp_train(process_idx=task, q=None, model=model) + self.ddp_train(process_idx=task, mp_queue=None, model=model) elif self.distributed_backend == 'ddp_cpu': - self.accelerator_backend = accelerator_backends.DDPSpawnBackend(self) + self.accelerator_backend = DDPSpawnBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model, nprocs=self.num_processes) results = self.accelerator_backend.teardown(model) elif self.distributed_backend == 'ddp_spawn': - self.accelerator_backend = accelerator_backends.DDPSpawnBackend(self) + self.accelerator_backend = DDPSpawnBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model, nprocs=self.num_processes) results = self.accelerator_backend.teardown(model) diff --git a/tests/base/develop_utils.py b/tests/base/develop_utils.py index 0b23c61c48e10..ada745951494c 100644 --- a/tests/base/develop_utils.py +++ b/tests/base/develop_utils.py @@ -60,7 +60,7 @@ def get_data_path(expt_logger, path_dir=None): return path_expt -def load_model_from_checkpoint(logger, root_weights_dir, module_class=EvalModelTemplate, path_expt=None): +def load_model_from_checkpoint(logger, root_weights_dir, module_class=EvalModelTemplate): trained_model = module_class.load_from_checkpoint(root_weights_dir) assert trained_model is not None, 'loading model failed' return trained_model diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 92c336e0acf39..b5377a3af3f66 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -92,7 +92,7 @@ def test_model_tpu_cores_8(tmpdir): # 8 cores needs a big dataset def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32) + dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) return dataset model.train_dataloader = long_train_loader model.val_dataloader = long_train_loader @@ -157,7 +157,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): # 8 cores needs a big dataset def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=15000, digits=(0, 1, 2, 5, 8)), batch_size=32) + dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) return dataset model.train_dataloader = long_train_loader model.val_dataloader = long_train_loader From dee7fa50dfbe308392d15feca340b43faefbb2b1 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 09:58:40 +0200 Subject: [PATCH 24/55] wip --- .../accelerator_backends/ddp_spawn_backend.py | 17 +++--- .../accelerator_backends/tpu_backend.py | 56 ++++++++----------- .../trainer/distrib_data_parallel.py | 16 +++--- 3 files changed, 40 insertions(+), 49 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py index c23d12dbc5e8b..6aee68f6634f2 100644 --- a/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py +++ b/pytorch_lightning/accelerator_backends/ddp_spawn_backend.py @@ -30,26 +30,27 @@ class DDPSpawnBackend(object): def __init__(self, trainer): self.trainer = trainer - self.q = None + self.mp_queue = None def setup(self): self.trainer.set_random_port() # pass in a state q smp = mp.get_context('spawn') - self.q = smp.SimpleQueue() + self.mp_queue = 
smp.SimpleQueue() def train(self, model, nprocs): - mp.spawn(self.ddp_train, nprocs=nprocs, args=(self.q, model,)) + mp.spawn(self.ddp_train, nprocs=nprocs, args=(self.mp_queue, model,)) def teardown(self, model): # restore main state with best weights - best_path = self.q.get() - results = self.q.get() - last_path = self.q.get() + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() # transfer back the best path to the trainer self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also bets score # load last weights if last_path is not None and not self.trainer.testing: @@ -65,7 +66,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 Args: process_idx: - mp_queue: + mp_queue: multiprocessing queue model: is_master: proc_offset: @@ -166,7 +167,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 model = self.trainer.get_model() # persist info in ddp_spawn - self.trainer.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) + self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index e6a60581431f6..da74c7117ee1d 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -13,6 +13,9 @@ # limitations under the License. import os + +import torch + from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning import _logger as log, LightningModule @@ -32,6 +35,7 @@ class TPUBackend(object): def __init__(self, trainer): self.trainer = trainer self.start_method = None + self.mp_queue = None def setup(self): rank_zero_info(f'training on {self.trainer.tpu_cores} TPU cores') @@ -42,10 +46,23 @@ def setup(self): # COLAB_GPU is an env var available by default in Colab environments. 
self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' + # pass in a state q + smp = xmp.get_context(self.start_method) + self.mp_queue = smp.SimpleQueue() + def teardown(self): + # restore main state with best weights + best_path = self.mp_queue.get() + results = self.mp_queue.get() + last_path = self.mp_queue.get() + + # transfer back the best path to the trainer + self.trainer.checkpoint_callback.best_model_path = best_path + # todo, pass also bets score # when training completes, load the weights back in main process self.__load_weights_on_main_process() + return results def train(self, model: LightningModule): self.trainer.model = model @@ -56,40 +73,11 @@ def train(self, model: LightningModule): else: xmp.spawn( self.tpu_train_in_process, - args=(model,), + args=(model, self.mp_queue), nprocs=self.trainer.tpu_cores, start_method=self.start_method ) - def _run_spawn(self, model, nprocs): - - # pass in a state q - smp = xmp.get_context('spawn') - queue = smp.SimpleQueue() - - xmp.spawn( - self.tpu_train_in_process, - args=(model, queue), - nprocs=self.trainer.tpu_cores, - start_method=self.start_method - ) - - # restore main state with best weights - best_path = queue.get() - results = queue.get() - last_path = queue.get() - - # transfer back the best path to the trainer - self.checkpoint_callback.best_model_path = best_path - - # load last weights - if last_path is not None and not self.testing: - ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) - - self.model = model - return results - def __load_weights_on_main_process(self): model = self.trainer.model @@ -99,7 +87,7 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue=None): """ Here we are inside each individual process """ @@ -111,13 +99,15 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_que self.__setup_tpu_training(model) # Run the pretrain routine - self.trainer.run_pretrain_routine(model) + results = self.trainer.run_pretrain_routine(model) # save weights at the end of training self.__save_end_of_training_weights(model) - def __save_end_of_training_weights(self, model: LightningModule): + # persist info in spawn + self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) + def __save_end_of_training_weights(self, model: LightningModule): # when training ends on these platforms dump weights to get out of the main process if self.trainer.on_colab_kaggle: rank_zero_warn('cleaning up... 
please do not interrupt') diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 1175dbfa3c223..10a1ea6b768d2 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -577,7 +577,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 model = self.get_model() # persist info in ddp_spawn - self.transfer_ddp_spawn_state_on_fit_end(model, mp_queue, results) + self.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) # clean up memory torch.cuda.empty_cache() @@ -585,7 +585,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 if self.global_rank == 0 and self.distributed_backend not in ['ddp_spawn', 'ddp_cpu']: return results - def transfer_ddp_spawn_state_on_fit_end(self, model, q, results): + def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.distributed_backend not in ['ddp_spawn', 'ddp_cpu', 'tpu']: return @@ -594,17 +594,17 @@ def transfer_ddp_spawn_state_on_fit_end(self, model, q, results): if self.checkpoint_callback is not None: best_model_path = self.checkpoint_callback.best_model_path - if self.global_rank == 0 and q is not None: + if self.global_rank == 0 and mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') - q.put(best_model_path) - q.put(results) + mp_queue.put(best_model_path) + mp_queue.put(results) # save the last weights last_path = None if not self.testing and best_model_path is not None and len(best_model_path) > 0: last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path) torch.save(model.state_dict(), last_path) - q.put(last_path) + mp_queue.put(last_path) def save_spawn_weights(self, model): """ @@ -613,7 +613,7 @@ def save_spawn_weights(self, model): :return: """ if self.is_global_zero: - path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt') + path = os.path.join(self.default_root_dir, '__temp_weight_distributed_end.ckpt') self.save_checkpoint(path) return path @@ -629,7 +629,7 @@ def load_spawn_weights(self, original_model): if self.is_global_zero: # load weights saved in ddp - path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt') + path = os.path.join(self.default_root_dir, '__temp_weight_distributed_end.ckpt') loaded_model = original_model.__class__.load_from_checkpoint(path) # copy loaded weights to old model From e2ece1bd414799fce54fb165c6bbe9958b740b07 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 10:38:54 +0200 Subject: [PATCH 25/55] wip --- .../accelerator_backends/tpu_backend.py | 12 ++++++++++-- pytorch_lightning/trainer/trainer.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index da74c7117ee1d..a7bf4ddc4dc9c 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -16,6 +16,7 @@ import torch +import torch.multiprocessing as mp from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning import _logger as log, LightningModule @@ -47,10 +48,10 @@ def setup(self): self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' # pass in a state q - smp = xmp.get_context(self.start_method) + smp = 
mp.get_context('spawn') self.mp_queue = smp.SimpleQueue() - def teardown(self): + def teardown(self, model): # restore main state with best weights best_path = self.mp_queue.get() results = self.mp_queue.get() @@ -60,6 +61,13 @@ def teardown(self): self.trainer.checkpoint_callback.best_model_path = best_path # todo, pass also bets score + # load last weights + if last_path is not None and not self.trainer.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) + + self.trainer.model = model + # when training completes, load the weights back in main process self.__load_weights_on_main_process() return results diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d4ae6f68adc60..4296ec86b5e96 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1112,7 +1112,7 @@ def fit( self.accelerator_backend = TPUBackend(self) self.accelerator_backend.setup() self.accelerator_backend.train(model) - self.accelerator_backend.teardown() + self.accelerator_backend.teardown(model) else: self.accelerator_backend = CPUBackend(self) From 6c11daeeaa248c4aa2482514a899fe8638c9d21f Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 11:02:57 +0200 Subject: [PATCH 26/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 5 +++-- pytorch_lightning/trainer/trainer.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index a7bf4ddc4dc9c..2e72ea60d3f3d 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -17,9 +17,10 @@ import torch import torch.multiprocessing as mp +from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning import _logger as log, LightningModule +from pytorch_lightning import _logger as log try: import torch_xla @@ -48,7 +49,7 @@ def setup(self): self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' # pass in a state q - smp = mp.get_context('spawn') + smp = mp.get_context(self.start_method) self.mp_queue = smp.SimpleQueue() def teardown(self, model): diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 4296ec86b5e96..a2181c0bca6cc 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -51,8 +51,8 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.trainer.configuration_validator import ConfigValidator -from pytorch_lightning.accelerator_backends import GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, \ - DataParallelBackend +from pytorch_lightning.accelerator_backends import ( + GPUBackend, TPUBackend, CPUBackend, DDPSpawnBackend, DataParallelBackend) # warnings to ignore in trainer warnings.filterwarnings( From c21628399995cd69cfcbca1cbaddfe1d3549f93a Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 12:23:43 +0200 Subject: [PATCH 27/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py 
b/pytorch_lightning/accelerator_backends/tpu_backend.py index 2e72ea60d3f3d..2f3436567a566 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -63,9 +63,9 @@ def teardown(self, model): # todo, pass also bets score # load last weights - if last_path is not None and not self.trainer.testing: - ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - model.load_state_dict(ckpt) + # if last_path is not None and not self.trainer.testing: + # ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + # model.load_state_dict(ckpt) self.trainer.model = model From 4f726a7fdc74d074d11c5c52a04abf531d0174bd Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 13:54:01 +0200 Subject: [PATCH 28/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 8 +++++--- pytorch_lightning/trainer/distrib_data_parallel.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 2f3436567a566..dbb54124a63e3 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -63,9 +63,9 @@ def teardown(self, model): # todo, pass also bets score # load last weights - # if last_path is not None and not self.trainer.testing: - # ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) - # model.load_state_dict(ckpt) + if last_path is not None and not self.trainer.testing: + ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) + model.load_state_dict(ckpt) self.trainer.model = model @@ -113,6 +113,8 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_que # save weights at the end of training self.__save_end_of_training_weights(model) + # for some reason the state is missing + self.trainer.distributed_backend = 'TPU' # persist info in spawn self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index 10a1ea6b768d2..f8305136b7297 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -586,7 +586,7 @@ def ddp_train(self, process_idx, mp_queue, model, is_master=False, proc_offset=0 return results def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): - if self.distributed_backend not in ['ddp_spawn', 'ddp_cpu', 'tpu']: + if self.distributed_backend.lower() not in ['ddp_spawn', 'ddp_cpu', 'tpu']: return # track the best model path @@ -596,6 +596,7 @@ def transfer_distrib_spawn_state_on_fit_end(self, model, mp_queue, results): if self.global_rank == 0 and mp_queue is not None: rank_zero_warn('cleaning up ddp environment...') + # todo, pass complete checkpoint as state dictionary mp_queue.put(best_model_path) mp_queue.put(results) From 91d1f5600547b20ada76ae08b09eb867e2ab95da Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 14:41:13 +0200 Subject: [PATCH 29/55] wip --- .../accelerator_backends/tpu_backend.py | 64 ++++++++++--------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index dbb54124a63e3..37feeafd33a82 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ 
b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -82,7 +82,7 @@ def train(self, model: LightningModule): else: xmp.spawn( self.tpu_train_in_process, - args=(model, self.mp_queue), + args=(model, self.trainer, self.mp_queue), nprocs=self.trainer.tpu_cores, start_method=self.start_method ) @@ -96,67 +96,69 @@ def __load_weights_on_main_process(self): self.trainer.model = model - def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, mp_queue=None): + def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, trainer=None, mp_queue=None): """ Here we are inside each individual process """ - if not self.trainer.testing: - self.trainer.setup('fit') + if not trainer: + trainer = self.trainer + if not trainer.testing: + trainer.setup('fit') model.setup('fit') # setup TPU training - self.__setup_tpu_training(model) + self.__setup_tpu_training(model, trainer) # Run the pretrain routine - results = self.trainer.run_pretrain_routine(model) + results = trainer.run_pretrain_routine(model) # save weights at the end of training - self.__save_end_of_training_weights(model) + self.__save_end_of_training_weights(model, trainer) - # for some reason the state is missing - self.trainer.distributed_backend = 'TPU' # persist info in spawn - self.trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) + trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - def __save_end_of_training_weights(self, model: LightningModule): + @staticmethod + def __save_end_of_training_weights(model: LightningModule, trainer=None): # when training ends on these platforms dump weights to get out of the main process - if self.trainer.on_colab_kaggle: + if trainer.on_colab_kaggle: rank_zero_warn('cleaning up... please do not interrupt') - self.trainer.save_spawn_weights(model) + trainer.save_spawn_weights(model) - def __setup_tpu_training(self, model: LightningModule): + @staticmethod + def __setup_tpu_training(model: LightningModule, trainer=None): # use the default device from the process tpu_device = xm.xla_device() # if given an ordinal device, use this as the device - if self.trainer.tpu_id is not None: - tpu_device = xm.xla_device(self.trainer.tpu_id) + if trainer.tpu_id is not None: + tpu_device = xm.xla_device(trainer.tpu_id) # track the device and move model to it - self.trainer._device = tpu_device - model.to(self.trainer._device) + trainer._device = tpu_device + model.to(trainer._device) # get the appropriate tpu ranks - self.trainer.tpu_local_core_rank = xm.get_local_ordinal() - self.trainer.tpu_global_core_rank = xm.get_ordinal() + trainer.tpu_local_core_rank = xm.get_local_ordinal() + trainer.tpu_global_core_rank = xm.get_ordinal() # avoid duplicating progress bar - if self.trainer.tpu_global_core_rank != 0 and self.trainer.progress_bar_callback is not None: - self.trainer.progress_bar_callback.disable() + if trainer.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None: + trainer.progress_bar_callback.disable() - self.trainer.global_rank = self.trainer.tpu_local_core_rank - rank_zero_only.rank = self.trainer.global_rank + trainer.global_rank = trainer.tpu_local_core_rank + rank_zero_only.rank = trainer.global_rank # CHOOSE OPTIMIZER # allow for lr schedulers as well - optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model) - self.trainer.optimizers = optimizers - self.trainer.lr_schedulers = lr_schedulers - self.trainer.optimizer_frequencies = optimizer_frequencies + optimizers, 
lr_schedulers, optimizer_frequencies = trainer.init_optimizers(model) + trainer.optimizers = optimizers + trainer.lr_schedulers = lr_schedulers + trainer.optimizer_frequencies = optimizer_frequencies # init 16 bit for TPU - if self.trainer.precision == 16: + if trainer.precision == 16: os.environ['XLA_USE_BF16'] = str(1) - log.info(f'INIT TPU local core: {self.trainer.tpu_local_core_rank},' - f' global rank: {self.trainer.tpu_global_core_rank}') + log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' + f' global rank: {trainer.tpu_global_core_rank}') From d6f1137a52d792c1365264651dddd80a75abc849 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 14:51:41 +0200 Subject: [PATCH 30/55] wip --- pytorch_lightning/accelerator_backends/tpu_backend.py | 8 ++++---- tests/models/test_tpu.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 37feeafd33a82..6238fadee4a56 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -118,15 +118,15 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine # persist info in spawn trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - @staticmethod - def __save_end_of_training_weights(model: LightningModule, trainer=None): + @classmethod + def __save_end_of_training_weights(cls, model: LightningModule, trainer=None): # when training ends on these platforms dump weights to get out of the main process if trainer.on_colab_kaggle: rank_zero_warn('cleaning up... please do not interrupt') trainer.save_spawn_weights(model) - @staticmethod - def __setup_tpu_training(model: LightningModule, trainer=None): + @classmethod + def __setup_tpu_training(cls, model: LightningModule, trainer=None): # use the default device from the process tpu_device = xm.xla_device() diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index b5377a3af3f66..397714820f54f 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -163,7 +163,6 @@ def long_train_loader(): model.val_dataloader = long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False) - assert os.environ.get('XLA_USE_BF16') == str(1), "XLA_USE_BF16 was not set in environment variables" @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 9465404df9e41d6ec7319fb55b42888bc12a1266 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:11:07 +0200 Subject: [PATCH 31/55] docs --- docs/source/index.rst | 2 +- pytorch_lightning/__init__.py | 2 +- pytorch_lightning/accelerator_backends/gpu_backend.py | 2 +- pytorch_lightning/accelerator_backends/tpu_backend.py | 5 +++-- pytorch_lightning/core/__init__.py | 6 +++++- pytorch_lightning/core/decorators.py | 2 -- tests/trainer/test_trainer.py | 2 +- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 9ab0da242d330..4b1b7c697a6c8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,7 +93,7 @@ PyTorch Lightning Documentation weights_loading optimizers profiler - use_single_gpu + single_gpu sequences training_tricks transfer_learning diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index ee49879ee3665..5cb42370c28db 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -54,11 +54,11 @@ # We are not importing the rest 
of the lightning during the build process, as it may not be compiled yet else: from pytorch_lightning.core import LightningDataModule, LightningModule, data_loader + from pytorch_lightning.core.step_result import TrainResult, EvalResult from pytorch_lightning.callbacks import Callback from pytorch_lightning.trainer import Trainer from pytorch_lightning.utilities.seed import seed_everything from pytorch_lightning import metrics - from pytorch_lightning.core.step_result import TrainResult, EvalResult __all__ = [ 'Trainer', diff --git a/pytorch_lightning/accelerator_backends/gpu_backend.py b/pytorch_lightning/accelerator_backends/gpu_backend.py index 72b9002ca5668..3b5f37671d9e8 100644 --- a/pytorch_lightning/accelerator_backends/gpu_backend.py +++ b/pytorch_lightning/accelerator_backends/gpu_backend.py @@ -14,7 +14,7 @@ import torch -from pytorch_lightning import LightningModule +from pytorch_lightning.core import LightningModule try: from apex import amp except ImportError: diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 6238fadee4a56..3cebc7fbc81c6 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -15,8 +15,8 @@ import os import torch - import torch.multiprocessing as mp + from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -161,4 +161,5 @@ def __setup_tpu_training(cls, model: LightningModule, trainer=None): os.environ['XLA_USE_BF16'] = str(1) log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' - f' global rank: {trainer.tpu_global_core_rank}') + f' global rank: {trainer.tpu_global_core_rank}' + f' with XLA_USE_BF16={os.environ["XLA_USE_BF16"]}') diff --git a/pytorch_lightning/core/__init__.py b/pytorch_lightning/core/__init__.py index 8a7770752f951..43861ad6dd785 100644 --- a/pytorch_lightning/core/__init__.py +++ b/pytorch_lightning/core/__init__.py @@ -305,5 +305,9 @@ def training_step(self, batch, batch_idx): from pytorch_lightning.core.decorators import data_loader from pytorch_lightning.core.lightning import LightningModule -__all__ = ['LightningDataModule', 'LightningModule', 'data_loader'] +__all__ = [ + 'LightningDataModule', + 'LightningModule', + 'data_loader', +] # __call__ = __all__ diff --git a/pytorch_lightning/core/decorators.py b/pytorch_lightning/core/decorators.py index b540c1b66bab3..6023de27d853b 100644 --- a/pytorch_lightning/core/decorators.py +++ b/pytorch_lightning/core/decorators.py @@ -1,8 +1,6 @@ from functools import wraps from typing import Callable -import torch - from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.utilities import rank_zero_warn diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9eb8fc0f14051..04ea0ba88ee6e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -897,7 +897,7 @@ def test_trainer_config(trainer_kwargs, expected): assert trainer.use_ddp2 is expected["use_ddp2"] assert trainer.num_gpus == expected["num_gpus"] assert trainer.on_gpu is expected["on_gpu"] - assert trainer.use_single_gpu is expected["single_gpu"] + assert trainer.use_single_gpu is expected["use_single_gpu"] assert trainer.num_processes == expected["num_processes"] From 75dc5d27384e3ab6308349249261fdd8ba98223d Mon Sep 17 00:00:00 2001 From: Jirka 
Date: Mon, 27 Jul 2020 15:23:01 +0200 Subject: [PATCH 32/55] typo --- tests/trainer/test_trainer.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 04ea0ba88ee6e..e11ed287f924d 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -814,79 +814,79 @@ def test_num_sanity_val_steps(tmpdir, limit_val_batches): @pytest.mark.parametrize("trainer_kwargs,expected", [ pytest.param( dict(distributed_backend=None, gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="dp", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="dp", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="ddp", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="ddp", num_processes=2, gpus=None), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=2) + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=2) ), pytest.param( dict(distributed_backend="ddp", num_nodes=2, gpus=None), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend="ddp_cpu", num_processes=2, gpus=None), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=2) + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=2) ), pytest.param( dict(distributed_backend="ddp2", gpus=None), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=1) + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=1) ), pytest.param( dict(distributed_backend=None, gpus=1), - dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, single_gpu=True, num_processes=1), + dict(use_dp=False, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, use_single_gpu=True, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="dp", gpus=1), - dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, single_gpu=True, num_processes=1), + dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=1, on_gpu=True, use_single_gpu=True, num_processes=1), 
marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="ddp", gpus=1), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=1, on_gpu=True, single_gpu=True, num_processes=1), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=1, on_gpu=True, use_single_gpu=True, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="ddp_cpu", num_processes=2, gpus=1), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, single_gpu=False, num_processes=2), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=0, on_gpu=False, use_single_gpu=False, num_processes=2), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend="ddp2", gpus=1), - dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=1, on_gpu=True, single_gpu=False, num_processes=1), + dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=1, on_gpu=True, use_single_gpu=False, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() == 0, reason="GPU needed")] ), pytest.param( dict(distributed_backend=None, gpus=2), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=2), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), pytest.param( dict(distributed_backend="dp", gpus=2), - dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1), + dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), pytest.param( dict(distributed_backend="ddp", gpus=2), - dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2), + dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=2), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), pytest.param( dict(distributed_backend="ddp2", gpus=2), - dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1), + dict(use_dp=False, use_ddp=False, use_ddp2=True, num_gpus=2, on_gpu=True, use_single_gpu=False, num_processes=1), marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")] ), ]) From 5a7bfa08ebbab25b78954701aad3d703df9e42d4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:33:14 +0200 Subject: [PATCH 33/55] tests --- .github/workflows/ci-testing.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index 7c3095fc281a1..f08e54d136ef2 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -30,6 +30,10 @@ jobs: # TODO: temporary fix till hanging jobs on macOS for py38 is resolved - python-version: 3.8 os: macOS-10.15 + # TODO: temporary fix till pyYaml can be installed, see: https://github.com/actions/setup-python/issues/114 + - python-version: 3.7 + os: ubuntu-18.04 + requires: 'minimal' # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 25 From 
22dd8c0d48eb491b3e162f5f8a29648176387667 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:36:41 +0200 Subject: [PATCH 34/55] tests --- pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 3cebc7fbc81c6..17be12b624f27 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -162,4 +162,4 @@ def __setup_tpu_training(cls, model: LightningModule, trainer=None): log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},' f' global rank: {trainer.tpu_global_core_rank}' - f' with XLA_USE_BF16={os.environ["XLA_USE_BF16"]}') + f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}') From e4f2088c7e77c1735dcae2840ce752df01eebf6b Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 15:57:30 +0200 Subject: [PATCH 35/55] tests --- pytorch_lightning/accelerator_backends/tpu_backend.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 17be12b624f27..18615862497d2 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -63,7 +63,7 @@ def teardown(self, model): # todo, pass also bets score # load last weights - if last_path is not None and not self.trainer.testing: + if last_path and not self.trainer.testing: ckpt = torch.load(last_path, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt) @@ -78,7 +78,7 @@ def train(self, model: LightningModule): # train if self.trainer.tpu_id is not None: - self.tpu_train_in_process(self.trainer.tpu_id, model) + self.tpu_train_in_process(self.trainer.tpu_id, model, self.trainer, self.mp_queue) else: xmp.spawn( self.tpu_train_in_process, @@ -118,15 +118,13 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine # persist info in spawn trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results) - @classmethod - def __save_end_of_training_weights(cls, model: LightningModule, trainer=None): + def __save_end_of_training_weights(self, model: LightningModule, trainer=None): # when training ends on these platforms dump weights to get out of the main process if trainer.on_colab_kaggle: rank_zero_warn('cleaning up... 
please do not interrupt') trainer.save_spawn_weights(model) - @classmethod - def __setup_tpu_training(cls, model: LightningModule, trainer=None): + def __setup_tpu_training(self, model: LightningModule, trainer=None): # use the default device from the process tpu_device = xm.xla_device() From d89de9b1e37eb96f7c3983ba08de6cb827c270f4 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 16:17:54 +0200 Subject: [PATCH 36/55] tests --- tests/models/test_tpu.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 397714820f54f..0a0436413d456 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -30,6 +30,7 @@ def test_model_tpu_cores_1(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, limit_val_batches=0.4 @@ -47,6 +48,7 @@ def test_model_tpu_index_1(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, limit_val_batches=0.4 @@ -65,6 +67,7 @@ def test_model_tpu_index_5(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=[5], limit_train_batches=0.4, limit_val_batches=0.4 @@ -83,6 +86,7 @@ def test_model_tpu_cores_8(tmpdir): default_root_dir=tmpdir, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, limit_val_batches=0.4 @@ -109,6 +113,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): precision=16, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, limit_val_batches=0.4 @@ -128,6 +133,7 @@ def test_model_16bit_tpu_index_1(tmpdir): precision=16, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, limit_val_batches=0.4 @@ -148,6 +154,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): precision=16, progress_bar_refresh_rate=0, max_epochs=1, + distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, limit_val_batches=0.4 @@ -176,6 +183,7 @@ def test_model_16bit_tpu_index_5(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, + distributed_backend='tpu', tpu_cores=[5], ) trainer.fit(model) @@ -195,6 +203,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, + distributed_backend='tpu', tpu_cores=[1], ) trainer.fit(model) @@ -213,6 +222,7 @@ def test_early_stop_checkpoints_on_tpu(tmpdir): max_epochs=50, limit_train_batches=10, limit_val_batches=10, + distributed_backend='tpu', tpu_cores=[5], ) trainer.fit(model) @@ -229,6 +239,7 @@ def test_dataloaders_passed_to_fit(tmpdir): trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, + distributed_backend='tpu', tpu_cores=8, ) result = trainer.fit( @@ -253,7 +264,10 @@ def test_tpu_id_to_be_as_expected(tpu_cores, expected_tpu_id): def test_tpu_misconfiguration(): """Test if trainer.tpu_id is set as expected""" with pytest.raises(MisconfigurationException, match="`tpu_cores` can only be"): - Trainer(tpu_cores=[1, 8]) + Trainer( + tpu_cores=[1, 8], + distributed_backend='tpu', + ) # @patch('pytorch_lightning.trainer.trainer.XLA_AVAILABLE', False) @@ -266,6 +280,7 @@ def test_exception_when_no_tpu_found(tmpdir): max_epochs=1, train_percent_check=0.4, val_percent_check=0.2, + distributed_backend='tpu', tpu_cores=8, ) From 
161472432c4c7dc5e2950ae3e43378e46b498e70 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 16:31:38 +0200 Subject: [PATCH 37/55] tests --- pytorch_lightning/trainer/distrib_data_parallel.py | 2 +- tests/models/test_tpu.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py index f8305136b7297..2546ec7bb3c7d 100644 --- a/pytorch_lightning/trainer/distrib_data_parallel.py +++ b/pytorch_lightning/trainer/distrib_data_parallel.py @@ -251,7 +251,7 @@ def is_function_implemented(self, *args) -> bool: def init_tpu(self): # turn off all the GPU stuff - self.distributed_backend = None + # self.distributed_backend = 'tpu' # enable tpu self.use_tpu = True diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 0a0436413d456..24b470381231e 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -33,7 +33,7 @@ def test_model_tpu_cores_1(tmpdir): distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -51,7 +51,7 @@ def test_model_tpu_index_1(tmpdir): distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -70,7 +70,7 @@ def test_model_tpu_index_5(tmpdir): distributed_backend='tpu', tpu_cores=[5], limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -89,7 +89,7 @@ def test_model_tpu_cores_8(tmpdir): distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -116,7 +116,7 @@ def test_model_16bit_tpu_cores_1(tmpdir): distributed_backend='tpu', tpu_cores=1, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -136,7 +136,7 @@ def test_model_16bit_tpu_index_1(tmpdir): distributed_backend='tpu', tpu_cores=[1], limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() @@ -157,7 +157,7 @@ def test_model_16bit_tpu_cores_8(tmpdir): distributed_backend='tpu', tpu_cores=8, limit_train_batches=0.4, - limit_val_batches=0.4 + limit_val_batches=0.4, ) model = EvalModelTemplate() From def846bdb47fe6246d114247fce4d79e541a3b12 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 17:12:16 +0200 Subject: [PATCH 38/55] tests --- .github/workflows/tpu-testing.yml | 4 ++-- tests/models/test_tpu.py | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index f5a08580ea737..6055696635bcf 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -14,8 +14,8 @@ env: GKE_CLUSTER: lightning-cluster GKE_ZONE: us-central1-a IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image - MAX_CHECKS: 60 - CHECK_SPEEP: 30 + MAX_CHECKS: 180 + CHECK_SPEEP: 10 jobs: setup-build-publish-deploy: diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 24b470381231e..652f09762cbd0 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -22,6 +22,17 @@ TPU_AVAILABLE = True +# 8 cores needs a big dataset +def _long_train_loader(): + dataset = DataLoader(TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8)), + batch_size=32, + ) + return dataset + + 
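# With tpu_cores=8 every spawned process builds this loader, so the cached
# MNIST file can end up being downloaded and loaded by several workers at once.
# Later patches in this series harden that path: tests/base/datasets.py refines
# its _try_load() retry helper (random back-off, re-raising the last exception),
# and [PATCH 47/55] serialises the dataset construction through torch_xla's
# MpSerialExecutor, roughly (a sketch of that later form):
#
#     import torch_xla.distributed.xla_multiprocessing as xmp
#     SERIAL_EXEC = xmp.MpSerialExecutor()
#
#     def _serial_train_loader():
#         dataset = SERIAL_EXEC.run(
#             lambda: TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)))
#         return DataLoader(dataset, batch_size=32)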
@pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_model_tpu_cores_1(tmpdir): @@ -93,13 +104,9 @@ def test_model_tpu_cores_8(tmpdir): ) model = EvalModelTemplate() - # 8 cores needs a big dataset - def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) - return dataset - model.train_dataloader = long_train_loader - model.val_dataloader = long_train_loader + model.train_dataloader = _long_train_loader + model.val_dataloader = _long_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -161,15 +168,11 @@ def test_model_16bit_tpu_cores_8(tmpdir): ) model = EvalModelTemplate() - # 8 cores needs a big dataset - def long_train_loader(): - dataset = DataLoader(TrialMNIST(download=True, num_samples=2000, digits=(0, 1, 2, 5, 8)), batch_size=32) - return dataset - model.train_dataloader = long_train_loader - model.val_dataloader = long_train_loader + model.train_dataloader = _long_train_loader + model.val_dataloader = _long_train_loader - tpipes.run_model_test(trainer_options, model, on_gpu=False) + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From f6060ec802aea59bdeff5fe67f4fae8e719c47e1 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 17:41:44 +0200 Subject: [PATCH 39/55] tests --- .circleci/config.yml | 8 +++++--- .github/workflows/tpu-testing.yml | 3 ++- tests/base/datasets.py | 5 ++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a515ef544f565..34d92c3f57801 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -62,10 +62,11 @@ references: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ + printf "Job not finished yet" && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
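          # (the flush delay below is switched from a hard-coded 30 s to the new
          # CHECK_SPEEP value, 10 s at this point; [PATCH 51/55] later puts a fixed
          # wait back here and reuses CHECK_SPEEP as the polling interval instead)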
- sleep 30 && \ + sleep $CHECK_SPEEP && \ echo "JOB_NAME: $job_name" && \ gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \ if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ @@ -101,7 +102,8 @@ jobs: docker: - image: circleci/python:3.7 environment: - - MAX_CHECKS: 60 + - MAX_CHECKS: 180 + - CHECK_SPEEP: 10 steps: - checkout - go/install diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index 6055696635bcf..b4233ad69a9f7 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -90,7 +90,8 @@ jobs: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \ + printf "Job not finished yet" && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
sleep $CHECK_SPEEP && \ diff --git a/tests/base/datasets.py b/tests/base/datasets.py index e9b630d1da7c9..5e2737b39c03f 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -107,7 +107,7 @@ def _download(self, data_folder: str) -> None: def _try_load(path_data, trials: int = 30, delta: float = 0.3): res = None - assert os.path.isfile(path_data) + assert os.path.isfile(path_data), 'missing file: %s' % path_data for _ in range(trials): try: res = torch.load(path_data) @@ -115,6 +115,9 @@ def _try_load(path_data, trials: int = 30, delta: float = 0.3): time.sleep(delta) else: break + else: + import traceback + traceback.print_exc() return res From f0a61746dbd91966c54d988bfde776e4f8b6bf50 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:08:22 +0200 Subject: [PATCH 40/55] tests --- docs/source/new-project.rst | 2 -- tests/base/datasets.py | 5 ++--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 043461fca0b89..dd28bc311d401 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -6,8 +6,6 @@ import torch from torch.nn import functional as F from torch.utils.data import DataLoader - from torchvision.datasets import MNIST - from torchvision import transforms Quick Start diff --git a/tests/base/datasets.py b/tests/base/datasets.py index 5e2737b39c03f..b01d3385c45fa 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -111,13 +111,12 @@ def _try_load(path_data, trials: int = 30, delta: float = 0.3): for _ in range(trials): try: res = torch.load(path_data) - except Exception: + except Exception as ex: time.sleep(delta) else: break else: - import traceback - traceback.print_exc() + raise ex return res From bdaaaefcc8a8a9e461a957b3db1118dee3095603 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:18:22 +0200 Subject: [PATCH 41/55] tests --- .circleci/config.yml | 2 +- .github/workflows/tpu-testing.yml | 2 +- tests/models/test_grad_norm.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 34d92c3f57801..da4674dfda515 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -62,7 +62,7 @@ references: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. - printf "Job not finished yet" && \ + printf "Waiting for job to finish: " && \ while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index b4233ad69a9f7..011bb727cd9cb 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -90,7 +90,7 @@ jobs: # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. 
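          # (at this point MAX_CHECKS is 180 and each check still sleeps 30 s, so the
          # TPU job gets roughly 90 minutes before the step gives up; [PATCH 51/55]
          # later drives the poll interval from CHECK_SPEEP as well)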
- printf "Job not finished yet" && \ + printf "Waiting for job to finish: " && \ while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep 30; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py index 7483167755a8f..dc7eee557d448 100644 --- a/tests/models/test_grad_norm.py +++ b/tests/models/test_grad_norm.py @@ -42,8 +42,8 @@ def on_after_backward(self): self.stored_grad_norms.append(out) -@pytest.mark.parametrize("norm_type", [1., 1.25, 1.5, 2, 3, 5, 10, 'inf']) -def test_grad_tracking(tmpdir, norm_type, rtol=1e-2): +@pytest.mark.parametrize("norm_type", [1., 1.25, 2, 3, 5, 10, 'inf']) +def test_grad_tracking(tmpdir, norm_type, rtol=5e-3): os.environ['PL_DEV_DEBUG'] = '1' # rtol=5e-3 respects the 3 decimals rounding in `.grad_norms` and above From 458471da880839b00ce1e920fd20e7c88506c52c Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:22:28 +0200 Subject: [PATCH 42/55] tests --- tests/base/datasets.py | 5 +++-- tests/models/test_tpu.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index b01d3385c45fa..b29984438f84b 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -1,5 +1,6 @@ import logging import os +import random import urllib.request import time from typing import Tuple, Optional, Sequence @@ -105,14 +106,14 @@ def _download(self, data_folder: str) -> None: urllib.request.urlretrieve(url, fpath) -def _try_load(path_data, trials: int = 30, delta: float = 0.3): +def _try_load(path_data, trials: int = 30, delta: float = 1.): res = None assert os.path.isfile(path_data), 'missing file: %s' % path_data for _ in range(trials): try: res = torch.load(path_data) except Exception as ex: - time.sleep(delta) + time.sleep(delta * random.random()) else: break else: diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 652f09762cbd0..f96be7e77fce9 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -26,9 +26,9 @@ def _long_train_loader(): dataset = DataLoader(TrialMNIST( download=True, - num_samples=2000, + num_samples=500, digits=(0, 1, 2, 5, 8)), - batch_size=32, + batch_size=16, ) return dataset From 6e6588a6d53dfeb65becc63669ad7d2b6f15f032 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:27:03 +0200 Subject: [PATCH 43/55] tests --- tests/models/test_tpu.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index f96be7e77fce9..704fbe214e521 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -12,6 +12,8 @@ try: import torch_xla + import torch_xla.distributed.xla_multiprocessing as xmp + SERIAL_EXEC = xmp.MpSerialExecutor() # TODO: The tests are aborted if the following lines are uncommented. 
Must be resolved with XLA team # device = torch_xla.core.xla_model.xla_device() # device_type = torch_xla.core.xla_model.xla_device_hw(device) @@ -24,11 +26,15 @@ # 8 cores needs a big dataset def _long_train_loader(): - dataset = DataLoader(TrialMNIST( - download=True, - num_samples=500, - digits=(0, 1, 2, 5, 8)), - batch_size=16, + # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb + dataset = DataLoader( + SERIAL_EXEC.run( + TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8)), + batch_size=32, + ) ) return dataset From 6dc0e85479c4ac9286b91ce54afe397f8f1eeed2 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:45:57 +0200 Subject: [PATCH 44/55] tests --- tests/base/datasets.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/base/datasets.py b/tests/base/datasets.py index b29984438f84b..d92d5a35e088b 100644 --- a/tests/base/datasets.py +++ b/tests/base/datasets.py @@ -107,17 +107,21 @@ def _download(self, data_folder: str) -> None: def _try_load(path_data, trials: int = 30, delta: float = 1.): - res = None + res, exp = None, None + assert trials, "at least some trial has to be set" assert os.path.isfile(path_data), 'missing file: %s' % path_data for _ in range(trials): try: res = torch.load(path_data) except Exception as ex: + exp = ex time.sleep(delta * random.random()) else: break else: - raise ex + # raise the caught exception if any + if exp: + raise exp return res From 6196724b13fdb48e75c2d9e55415a4a66e8c89fb Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:50:09 +0200 Subject: [PATCH 45/55] tests --- tests/models/test_tpu.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 704fbe214e521..130781af5d761 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -32,9 +32,10 @@ def _long_train_loader(): TrialMNIST( download=True, num_samples=2000, - digits=(0, 1, 2, 5, 8)), - batch_size=32, - ) + digits=(0, 1, 2, 5, 8), + ) + ), + batch_size=32, ) return dataset From 183fcc2d851a9f2c84e2857b17696b41bb136cfc Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 18:56:58 +0200 Subject: [PATCH 46/55] tests --- tests/models/test_tpu.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 130781af5d761..c56ec4010bd83 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -28,12 +28,10 @@ def _long_train_loader(): # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb dataset = DataLoader( - SERIAL_EXEC.run( - TrialMNIST( - download=True, - num_samples=2000, - digits=(0, 1, 2, 5, 8), - ) + TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8), ), batch_size=32, ) @@ -112,8 +110,8 @@ def test_model_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _long_train_loader - model.val_dataloader = _long_train_loader + model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -176,8 +174,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = _long_train_loader - model.val_dataloader = _long_train_loader + 
model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) From e7a5295652dd4f89ea4d18daa1aac5d4362d3374 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 19:07:20 +0200 Subject: [PATCH 47/55] tests --- tests/models/test_tpu.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index c56ec4010bd83..ee17e2162ead3 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -25,14 +25,18 @@ # 8 cores needs a big dataset -def _long_train_loader(): +def _serial_train_loader(): # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb - dataset = DataLoader( - TrialMNIST( + + def _get_dataset(): + return TrialMNIST( download=True, num_samples=2000, digits=(0, 1, 2, 5, 8), - ), + ) + + dataset = DataLoader( + SERIAL_EXEC.run(_get_dataset), batch_size=32, ) return dataset @@ -110,8 +114,8 @@ def test_model_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) - model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.train_dataloader = _serial_train_loader + model.val_dataloader = _serial_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) @@ -174,8 +178,8 @@ def test_model_16bit_tpu_cores_8(tmpdir): model = EvalModelTemplate() # 8 cores needs a big dataset - model.train_dataloader = SERIAL_EXEC.run(_long_train_loader) - model.val_dataloader = SERIAL_EXEC.run(_long_train_loader) + model.train_dataloader = _serial_train_loader + model.val_dataloader = _serial_train_loader tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) From 199919883e32c49a1341e090f38c2973e4266454 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 21:33:17 +0200 Subject: [PATCH 48/55] tests --- tests/models/test_tpu.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index ee17e2162ead3..89d5dce279840 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -24,22 +24,19 @@ TPU_AVAILABLE = True -# 8 cores needs a big dataset -def _serial_train_loader(): - # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/resnet18-training.ipynb +_LARGER_DATASET = TrialMNIST( + download=True, + num_samples=2000, + digits=(0, 1, 2, 5, 8), +) - def _get_dataset(): - return TrialMNIST( - download=True, - num_samples=2000, - digits=(0, 1, 2, 5, 8), - ) - dataset = DataLoader( - SERIAL_EXEC.run(_get_dataset), +# 8 cores needs a big dataset +def _serial_train_loader(): + return DataLoader( + _LARGER_DATASET, batch_size=32, ) - return dataset @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine") From 9579bce6ce248ca27653311be1d510b9b3590463 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 21:54:09 +0200 Subject: [PATCH 49/55] tests --- pytorch_lightning/accelerator_backends/tpu_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 18615862497d2..7591f6e22e31c 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ 
-46,7 +46,8 @@ def setup(self): raise MisconfigurationException('PyTorch XLA not installed.') # COLAB_GPU is an env var available by default in Colab environments. - self.start_method = 'fork' if self.trainer.on_colab_kaggle else 'spawn' + # see: https://discuss.pytorch.org/t/segfault-with-multiprocessing-queue/81292/2 + self.start_method = 'fork' # pass in a state q smp = mp.get_context(self.start_method) From fde01de908bcec61202bac0a0e196b2c2dc7539f Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 22:08:25 +0200 Subject: [PATCH 50/55] docs --- CHANGELOG.md | 2 ++ pytorch_lightning/accelerator_backends/tpu_backend.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af692fdd0f938..e65efa50cb38d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,6 +38,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `weights_save_path` getting ignored when `logger=False` is passed to Trainer ([#2681](https://github.com/PyTorchLightning/pytorch-lightning/pull/2681)) +- Fixed TPU multi-core and Float16 ([#2632](https://github.com/PyTorchLightning/pytorch-lightning/pull/2632)) + ## [0.8.5] - 2020-07-09 ### Added diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py index 7591f6e22e31c..b40ae3c23e7f8 100644 --- a/pytorch_lightning/accelerator_backends/tpu_backend.py +++ b/pytorch_lightning/accelerator_backends/tpu_backend.py @@ -45,7 +45,6 @@ def setup(self): if not XLA_AVAILABLE: raise MisconfigurationException('PyTorch XLA not installed.') - # COLAB_GPU is an env var available by default in Colab environments. # see: https://discuss.pytorch.org/t/segfault-with-multiprocessing-queue/81292/2 self.start_method = 'fork' From 6abee2084984b8d617d52be40b5a4cacf9c17a20 Mon Sep 17 00:00:00 2001 From: Jirka Date: Mon, 27 Jul 2020 22:22:18 +0200 Subject: [PATCH 51/55] docs --- .circleci/config.yml | 8 ++++---- .github/workflows/tpu-testing.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index da4674dfda515..ba24d3bb82086 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -63,10 +63,10 @@ references: # still the job hasn't finished, give up and return the starting # non-zero status code. printf "Waiting for job to finish: " && \ - while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \ + while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
From 6abee2084984b8d617d52be40b5a4cacf9c17a20 Mon Sep 17 00:00:00 2001
From: Jirka
Date: Mon, 27 Jul 2020 22:22:18 +0200
Subject: [PATCH 51/55] docs

---
 .circleci/config.yml              | 8 ++++----
 .github/workflows/tpu-testing.yml | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index da4674dfda515..ba24d3bb82086 100755
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -63,10 +63,10 @@ references:
             # still the job hasn't finished, give up and return the starting
             # non-zero status code.
             printf "Waiting for job to finish: " && \
-            while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep 30; done && \
+            while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
             echo "Done waiting. Job status code: $status_code" && \
             # Allow time for logs to flush.
-            sleep $CHECK_SPEEP && \
+            sleep 10 && \
             echo "JOB_NAME: $job_name" && \
             gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
             if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \
@@ -102,8 +102,8 @@ jobs:
     docker:
       - image: circleci/python:3.7
     environment:
-      - MAX_CHECKS: 180
-      - CHECK_SPEEP: 10
+      - MAX_CHECKS: 240
+      - CHECK_SPEEP: 5
     steps:
       - checkout
       - go/install

diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml
index 011bb727cd9cb..245e424181007 100644
--- a/.github/workflows/tpu-testing.yml
+++ b/.github/workflows/tpu-testing.yml
@@ -14,8 +14,8 @@ env:
   GKE_CLUSTER: lightning-cluster
   GKE_ZONE: us-central1-a
   IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
-  MAX_CHECKS: 180
-  CHECK_SPEEP: 10
+  MAX_CHECKS: 240
+  CHECK_SPEEP: 5
 
 jobs:
   setup-build-publish-deploy:
@@ -91,10 +91,10 @@ jobs:
           # still the job hasn't finished, give up and return the starting
           # non-zero status code.
           printf "Waiting for job to finish: " && \
-          while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep 30; done && \
+          while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
           echo "Done waiting. Job status code: $status_code" && \
           # Allow time for logs to flush.
-          sleep $CHECK_SPEEP && \
+          sleep 10 && \
           echo "JOB_NAME: $job_name" && \
           echo "GKE_CLUSTER: $GKE_CLUSTER" && \
           echo "GKE_ZONE: $GKE_ZONE" && \
From bf6ac749eefa2ca2c07802fcb28934f01b7979ab Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 27 Jul 2020 22:31:44 +0200
Subject: [PATCH 52/55] Apply suggestions from code review

Co-authored-by: Ethan Harris
Co-authored-by: Rohit Gupta
---
 pytorch_lightning/accelerator_backends/tpu_backend.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py
index b40ae3c23e7f8..fac8594f0358f 100644
--- a/pytorch_lightning/accelerator_backends/tpu_backend.py
+++ b/pytorch_lightning/accelerator_backends/tpu_backend.py
@@ -124,14 +124,15 @@ def __save_end_of_training_weights(self, model: LightningModule, trainer=None):
             rank_zero_warn('cleaning up... please do not interrupt')
             trainer.save_spawn_weights(model)
 
-    def __setup_tpu_training(self, model: LightningModule, trainer=None):
+    def __setup_tpu_training(self, model: LightningModule, trainer):
         # use the default device from the process
         tpu_device = xm.xla_device()
 
         # if given an ordinal device, use this as the device
         if trainer.tpu_id is not None:
             tpu_device = xm.xla_device(trainer.tpu_id)
-
+        else:
+            tpu_device = xm.xla_device()
         # track the device and move model to it
         trainer._device = tpu_device
         model.to(trainer._device)

From 6f30bc56d1cd1561ec77a3e7fa12b41382bd4a37 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 27 Jul 2020 22:56:25 +0200
Subject: [PATCH 53/55] Apply suggestions from code review

Co-authored-by: Ethan Harris
---
 pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py
index fac8594f0358f..95203550eb37e 100644
--- a/pytorch_lightning/accelerator_backends/tpu_backend.py
+++ b/pytorch_lightning/accelerator_backends/tpu_backend.py
@@ -118,7 +118,7 @@ def tpu_train_in_process(self, tpu_core_idx: int, model: LightningModule, traine
         # persist info in spawn
         trainer.transfer_distrib_spawn_state_on_fit_end(model, mp_queue, results)
 
-    def __save_end_of_training_weights(self, model: LightningModule, trainer=None):
+    def __save_end_of_training_weights(self, model: LightningModule, trainer):
         # when training ends on these platforms dump weights to get out of the main process
         if trainer.on_colab_kaggle:
             rank_zero_warn('cleaning up... please do not interrupt')

From 45233a50870c0de317fab1c999e82c005cb80d4d Mon Sep 17 00:00:00 2001
From: Jirka
Date: Mon, 27 Jul 2020 23:17:04 +0200
Subject: [PATCH 54/55] docs

---
 tests/base/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/base/datasets.py b/tests/base/datasets.py
index d92d5a35e088b..27e614eee68cf 100644
--- a/tests/base/datasets.py
+++ b/tests/base/datasets.py
@@ -63,7 +63,6 @@ def __init__(self, root: str = PATH_DATASETS, train: bool = True,
             raise RuntimeError('Dataset not found.')
 
         data_file = self.TRAIN_FILE_NAME if self.train else self.TEST_FILE_NAME
-        # FIXME: try to fix loading
         self.data, self.targets = _try_load(os.path.join(self.cached_folder_path, data_file))
 
     def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
@@ -107,6 +106,7 @@ def _download(self, data_folder: str) -> None:
 
 
 def _try_load(path_data, trials: int = 30, delta: float = 1.):
+    """Resolve loading the same file at the same time from multiple concurrent processes."""
     res, exp = None, None
     assert trials, "at least some trial has to be set"
     assert os.path.isfile(path_data), 'missing file: %s' % path_data
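The _try_load helper that gains a docstring in patch 54 retries reads that may race with another process touching the same cached file; its full body is not shown in this series. The idea is roughly the following (an illustrative sketch under that assumption, not the exact upstream implementation):

    # Sketch: retry loading a cached file a few times before giving up, since
    # several test processes may read/write the same cache path concurrently.
    import os
    import time

    import torch


    def try_load(path_data, trials: int = 30, delta: float = 1.0):
        assert trials, "at least some trial has to be set"
        assert os.path.isfile(path_data), 'missing file: %s' % path_data
        exception = None
        for _ in range(trials):
            try:
                return torch.load(path_data)
            except Exception as err:  # e.g. a partially written file
                exception = err
                time.sleep(delta)
        raise exception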
From 6c9495e04bbe602cb662fac8376c77dfe2280922 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 27 Jul 2020 23:05:36 +0200
Subject: [PATCH 55/55] Apply suggestions from code review

Co-authored-by: Rohit Gupta
---
 pytorch_lightning/accelerator_backends/tpu_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/accelerator_backends/tpu_backend.py b/pytorch_lightning/accelerator_backends/tpu_backend.py
index 95203550eb37e..8d1d1b271b7dc 100644
--- a/pytorch_lightning/accelerator_backends/tpu_backend.py
+++ b/pytorch_lightning/accelerator_backends/tpu_backend.py
@@ -126,7 +126,7 @@ def __save_end_of_training_weights(self, model: LightningModule, trainer):
 
     def __setup_tpu_training(self, model: LightningModule, trainer):
         # use the default device from the process
-        tpu_device = xm.xla_device()
+        # tpu_device = xm.xla_device()
 
         # if given an ordinal device, use this as the device
         if trainer.tpu_id is not None:
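The series breaks off mid-hunk here. Taken together, patches 52 and 55 leave the device selection in __setup_tpu_training looking roughly like the sketch below (reconstructed for readability from the hunks above, not quoted from the final file):

    # Sketch: pick an explicit TPU ordinal when one was requested, otherwise the
    # per-process default device, then move the model onto it.
    import torch_xla.core.xla_model as xm


    def setup_tpu_training(model, trainer):
        if trainer.tpu_id is not None:
            tpu_device = xm.xla_device(trainer.tpu_id)  # explicit ordinal, e.g. tpu_cores=[1]
        else:
            tpu_device = xm.xla_device()                # default device for this process
        trainer._device = tpu_device
        model.to(trainer._device)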