From b4687590e8216ef7beb43fe00a0db274325b55fd Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 31 Aug 2021 10:33:48 +0100
Subject: [PATCH 1/7] Add a warning to deepspeed to let the user know when we
 auto-infer the batch size

---
 .../plugins/training_type/deepspeed.py |  5 ++
 tests/plugins/test_deepspeed_plugin.py | 56 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 94fb868d1c646..1b066bbff3038 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -562,6 +562,11 @@ def _format_batch_size_and_grad_accum_config(self):
                 " as this will be set via accumulate_grad_batches=x argument passed via the Lightning Trainer."
             )
         if "train_micro_batch_size_per_gpu" not in self.config:
+            rank_zero_warn(
+                "Inferring the batch size for internal deepspeed logging from the ``train_dataloader()``. "
+                "If you require skipping this, please pass "
+                "``Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=X)``, where X is the batch size."
+            )
             batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size
         self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index a5e4e1d189aaa..ae481ed427496 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 from unittest import mock

 import pytest
@@ -11,7 +11,7 @@
 from torch.utils.data import DataLoader
 from torchmetrics import Accuracy

-from pytorch_lightning import LightningModule, seed_everything, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, seed_everything, Trainer
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
@@ -830,3 +830,55 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir):
     trainer.fit(model)

     _assert_save_model_is_equal(model, tmpdir, trainer)
+
+
+@RunIf(min_gpus=1, deepspeed=True, special=True)
+def test_deepspeed_warn_train_dataloader_called(tmpdir):
+    """
+    Test DeepSpeed warns when it calls ``train_dataloader`` internally for logging batch size.
+    """
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        plugins=[DeepSpeedPlugin()],
+        gpus=1,
+        fast_dev_run=True,
+    )
+    with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1, deepspeed=True, special=True)
+def test_deepspeed_setup_train_dataloader(tmpdir):
+    """
+    Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.
+    """
+
+    class PlDataModule(LightningDataModule):
+        def __init__(self):
+            super().__init__()
+            self._setup = False
+
+        def setup(self, stage: Optional[str] = None) -> None:
+            self._setup = True
+
+        def train_dataloader(self):
+            assert self._setup
+            return DataLoader(RandomDataset(32, 64), batch_size=2)
+
+        def val_dataloader(self):
+            assert self._setup
+            return DataLoader(RandomDataset(32, 64), batch_size=2)
+
+        def test_dataloader(self):
+            assert self._setup
+            return DataLoader(RandomDataset(32, 64), batch_size=2)
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        plugins=[DeepSpeedPlugin(logging_batch_size_per_gpu=32)],
+        gpus=1,
+        fast_dev_run=True,
+    )
+    trainer.fit(model, datamodule=PlDataModule())

From 58099b1b608149b0edd9da2aef8c7c66c0318dfc Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 31 Aug 2021 10:35:38 +0100
Subject: [PATCH 2/7] Add CHANGELOG.md

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f48f1c3f7104..0274aed1d9434 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -100,6 +100,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Add support for CPU AMP autocast ([#9084](https://github.com/PyTorchLightning/pytorch-lightning/pull/9084))


+- Add a warning to deepspeed when inferring batch size ([#9221](https://github.com/PyTorchLightning/pytorch-lightning/pull/9221))
+
+
 ### Changed

 - Parsing of the `gpus` Trainer argument has changed: `gpus="n"` (str) no longer selects the GPU index n and instead selects the first n devices. ([#8770](https://github.com/PyTorchLightning/pytorch-lightning/pull/8770))

From 33c0e09c3eacd9073bdcb6daf205f2bdcaa4ece3 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Tue, 7 Sep 2021 11:04:45 +0100
Subject: [PATCH 3/7] Update pytorch_lightning/plugins/training_type/deepspeed.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 pytorch_lightning/plugins/training_type/deepspeed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 1b066bbff3038..599c1ebf9a1b9 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -563,9 +563,9 @@ def _format_batch_size_and_grad_accum_config(self):
             )
         if "train_micro_batch_size_per_gpu" not in self.config:
             rank_zero_warn(
-                "Inferring the batch size for internal deepspeed logging from the ``train_dataloader()``. "
+                "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. "
                 "If you require skipping this, please pass "
-                "``Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=X)``, where X is the batch size."
+                "`Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size)`"
             )
             batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size

From f26f0a34934735016dc36dd02c0558a0a78b8ed9 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 7 Sep 2021 11:14:30 +0100
Subject: [PATCH 4/7] Update test to ensure we also run trainer.test

---
 tests/plugins/test_deepspeed_plugin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index ae481ed427496..9617c5c96155e 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -854,7 +854,7 @@ def test_deepspeed_setup_train_dataloader(tmpdir):
     Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.
     """

-    class PlDataModule(LightningDataModule):
+    class TestSetupIsCalledDataModule(LightningDataModule):
         def __init__(self):
             super().__init__()
             self._setup = False
@@ -881,4 +881,5 @@ def test_dataloader(self):
         gpus=1,
         fast_dev_run=True,
     )
-    trainer.fit(model, datamodule=PlDataModule())
+    trainer.fit(model, datamodule=TestSetupIsCalledDataModule())
+    trainer.test(model)

From d47188e7e1c09c24756a69fb5893d93d3052752f Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 7 Sep 2021 15:37:50 +0100
Subject: [PATCH 5/7] Pre-commit

---
 tests/plugins/test_deepspeed_plugin.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 9617c5c96155e..dd052f9d09f56 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -834,9 +834,7 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir):

 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_warn_train_dataloader_called(tmpdir):
-    """
-    Test DeepSpeed warns when it calls ``train_dataloader`` internally for logging batch size.
-    """
+    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch size."""
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,

From 5f966ec02c4eece0bdbfe13cdb6537261cb46200 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 7 Sep 2021 14:41:33 +0000
Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/plugins/test_deepspeed_plugin.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 02d04aa611d78..de4bb3ea987f9 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -812,7 +812,8 @@ def training_step(self, batch, batch_idx):

 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_warn_train_dataloader_called(tmpdir):
-    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch size."""
+    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch
+    size."""
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
@@ -826,9 +827,7 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir):

 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_setup_train_dataloader(tmpdir):
-    """
-    Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.
-    """
+    """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually."""

     class TestSetupIsCalledDataModule(LightningDataModule):
         def __init__(self):

From 7b1c15f10f8b9de4c1c87d9c06a2c7cd6e0288e2 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Tue, 7 Sep 2021 16:56:44 +0100
Subject: [PATCH 7/7] Update pytorch_lightning/plugins/training_type/deepspeed.py

Co-authored-by: Ethan Harris
---
 pytorch_lightning/plugins/training_type/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index a4173b57f0b8d..ca10b47bd9fd2 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -549,7 +549,7 @@ def _format_batch_size_and_grad_accum_config(self):
             rank_zero_warn(
                 "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. "
                 "If you require skipping this, please pass "
-                "`Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size)`"
+                "`Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`"
             )
             batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size
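Editor's note: after this series, a user silences the inference warning by passing the batch size to the plugin explicitly, as the warning text suggests. Below is a minimal sketch of that usage against the 1.4-era API shown in the diffs; `TinyModel` and `TinyDataset` are illustrative stand-ins for the repo-internal `BoringModel`/`RandomDataset` test helpers and are not part of the PR.

```python
import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin


class TinyDataset(Dataset):
    """64 random feature vectors of size 32, mirroring the tests' RandomDataset(32, 64)."""

    def __init__(self):
        self.data = torch.randn(64, 32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


class TinyModel(LightningModule):
    """Illustrative stand-in for the BoringModel helper used by the tests."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        return DataLoader(TinyDataset(), batch_size=2)


# Passing `logging_batch_size_per_gpu` pre-populates
# `train_micro_batch_size_per_gpu` in the DeepSpeed config, so the plugin no
# longer infers it from `train_dataloader()` and the `rank_zero_warn` added in
# PATCH 1/7 never fires. Requires a GPU and the `deepspeed` package installed.
trainer = Trainer(
    gpus=1,
    plugins=[DeepSpeedPlugin(logging_batch_size_per_gpu=2)],
    fast_dev_run=True,
)
trainer.fit(TinyModel())
```

Omitting `logging_batch_size_per_gpu` would instead emit the warning once on rank zero and fall back to `_auto_select_batch_size()`, the path exercised by `test_deepspeed_warn_train_dataloader_called` above.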