From b4687590e8216ef7beb43fe00a0db274325b55fd Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 31 Aug 2021 10:33:48 +0100
Subject: [PATCH 1/7] Add a warning to deepspeed to let the user know when we
 auto-infer the batch size

---
 .../plugins/training_type/deepspeed.py |  5 ++
 tests/plugins/test_deepspeed_plugin.py | 56 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 94fb868d1c646..1b066bbff3038 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -562,6 +562,11 @@ def _format_batch_size_and_grad_accum_config(self):
                 " as this will be set via accumulate_grad_batches=x argument passed via the Lightning Trainer."
             )
         if "train_micro_batch_size_per_gpu" not in self.config:
+            rank_zero_warn(
+                "Inferring the batch size for internal deepspeed logging from the ``train_dataloader()``. "
+                "If you require skipping this, please pass "
+                "``Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=X)``, where X is the batch size."
+            )
             batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size
         self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches
diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index a5e4e1d189aaa..ae481ed427496 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -1,6 +1,6 @@
 import json
 import os
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 from unittest import mock

 import pytest
@@ -11,7 +11,7 @@
 from torch.utils.data import DataLoader
 from torchmetrics import Accuracy

-from pytorch_lightning import LightningModule, seed_everything, Trainer
+from pytorch_lightning import LightningDataModule, LightningModule, seed_everything, Trainer
 from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
 from pytorch_lightning.plugins import DeepSpeedPlugin, DeepSpeedPrecisionPlugin
 from pytorch_lightning.plugins.training_type.deepspeed import LightningDeepSpeedModule
@@ -830,3 +830,55 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir):
     trainer.fit(model)

     _assert_save_model_is_equal(model, tmpdir, trainer)
+
+
+@RunIf(min_gpus=1, deepspeed=True, special=True)
+def test_deepspeed_warn_train_dataloader_called(tmpdir):
+    """
+    Test DeepSpeed warns when it calls ``train_dataloader`` internally for logging batch size.
+    """
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        plugins=[DeepSpeedPlugin()],
+        gpus=1,
+        fast_dev_run=True,
+    )
+    with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"):
+        trainer.fit(model)
+
+
+@RunIf(min_gpus=1, deepspeed=True, special=True)
+def test_deepspeed_setup_train_dataloader(tmpdir):
+    """
+    Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.
+    """
+
+    class PlDataModule(LightningDataModule):
+        def __init__(self):
+            super().__init__()
+            self._setup = False
+
+        def setup(self, stage: Optional[str] = None) -> None:
+            self._setup = True
+
+        def train_dataloader(self):
+            assert self._setup
+            return DataLoader(RandomDataset(32, 64), batch_size=2)
+
+        def val_dataloader(self):
+            assert self._setup
+            return DataLoader(RandomDataset(32, 64), batch_size=2)
+
+        def test_dataloader(self):
+            assert self._setup
+            return DataLoader(RandomDataset(32, 64), batch_size=2)
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        plugins=[DeepSpeedPlugin(logging_batch_size_per_gpu=32)],
+        gpus=1,
+        fast_dev_run=True,
+    )
+    trainer.fit(model, datamodule=PlDataModule())

From 58099b1b608149b0edd9da2aef8c7c66c0318dfc Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 31 Aug 2021 10:35:38 +0100
Subject: [PATCH 2/7] Add CHANGELOG.md

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f48f1c3f7104..0274aed1d9434 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -100,6 +100,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Add support for CPU AMP autocast ([#9084](https://github.com/PyTorchLightning/pytorch-lightning/pull/9084))


+- Add a warning to deepspeed when inferring batch size ([#9221](https://github.com/PyTorchLightning/pytorch-lightning/pull/9221))
+
+
 ### Changed

 - Parsing of the `gpus` Trainer argument has changed: `gpus="n"` (str) no longer selects the GPU index n and instead selects the first n devices. ([#8770](https://github.com/PyTorchLightning/pytorch-lightning/pull/8770))

From 33c0e09c3eacd9073bdcb6daf205f2bdcaa4ece3 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Tue, 7 Sep 2021 11:04:45 +0100
Subject: [PATCH 3/7] Update pytorch_lightning/plugins/training_type/deepspeed.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 pytorch_lightning/plugins/training_type/deepspeed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index 1b066bbff3038..599c1ebf9a1b9 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -563,9 +563,9 @@ def _format_batch_size_and_grad_accum_config(self):
             )
         if "train_micro_batch_size_per_gpu" not in self.config:
             rank_zero_warn(
-                "Inferring the batch size for internal deepspeed logging from the ``train_dataloader()``. "
+                "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. "
                 "If you require skipping this, please pass "
-                "``Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=X)``, where X is the batch size."
+                "`Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size)`"
             )
             batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size

From f26f0a34934735016dc36dd02c0558a0a78b8ed9 Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 7 Sep 2021 11:14:30 +0100
Subject: [PATCH 4/7] Update test to ensure we also run trainer.test

---
 tests/plugins/test_deepspeed_plugin.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index ae481ed427496..9617c5c96155e 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -854,7 +854,7 @@ def test_deepspeed_setup_train_dataloader(tmpdir):
     Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.
     """

-    class PlDataModule(LightningDataModule):
+    class TestSetupIsCalledDataModule(LightningDataModule):
         def __init__(self):
             super().__init__()
             self._setup = False
@@ -881,4 +881,5 @@ def test_dataloader(self):
         gpus=1,
         fast_dev_run=True,
     )
-    trainer.fit(model, datamodule=PlDataModule())
+    trainer.fit(model, datamodule=TestSetupIsCalledDataModule())
+    trainer.test(model)

From d47188e7e1c09c24756a69fb5893d93d3052752f Mon Sep 17 00:00:00 2001
From: SeanNaren
Date: Tue, 7 Sep 2021 15:37:50 +0100
Subject: [PATCH 5/7] Pre-commit

---
 tests/plugins/test_deepspeed_plugin.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 9617c5c96155e..dd052f9d09f56 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -834,9 +834,7 @@ def test_deepspeed_multigpu_no_schedulers(tmpdir):

 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_warn_train_dataloader_called(tmpdir):
-    """
-    Test DeepSpeed warns when it calls ``train_dataloader`` internally for logging batch size.
-    """
+    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch size."""
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,

From 5f966ec02c4eece0bdbfe13cdb6537261cb46200 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 7 Sep 2021 14:41:33 +0000
Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/plugins/test_deepspeed_plugin.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py
index 02d04aa611d78..de4bb3ea987f9 100644
--- a/tests/plugins/test_deepspeed_plugin.py
+++ b/tests/plugins/test_deepspeed_plugin.py
@@ -812,7 +812,8 @@ def training_step(self, batch, batch_idx):

 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_warn_train_dataloader_called(tmpdir):
-    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch size."""
+    """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch
+    size."""
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
@@ -826,9 +827,7 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir):

 @RunIf(min_gpus=1, deepspeed=True, special=True)
 def test_deepspeed_setup_train_dataloader(tmpdir):
-    """
-    Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.
-    """
+    """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually."""

     class TestSetupIsCalledDataModule(LightningDataModule):
         def __init__(self):

From 7b1c15f10f8b9de4c1c87d9c06a2c7cd6e0288e2 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Tue, 7 Sep 2021 16:56:44 +0100
Subject: [PATCH 7/7] Update pytorch_lightning/plugins/training_type/deepspeed.py

Co-authored-by: Ethan Harris
---
 pytorch_lightning/plugins/training_type/deepspeed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index a4173b57f0b8d..ca10b47bd9fd2 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -549,7 +549,7 @@ def _format_batch_size_and_grad_accum_config(self):
             rank_zero_warn(
                 "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. "
                 "If you require skipping this, please pass "
-                "`Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size)`"
+                "`Trainer(plugins=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`"
             )
             batch_size = self._auto_select_batch_size()
             self.config["train_micro_batch_size_per_gpu"] = batch_size
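Editor's note: after this series, a user silences the inference warning by passing the batch size to the plugin explicitly, as the warning text suggests. Below is a minimal sketch of that usage against the 1.4-era API shown in the diffs; `TinyModel` and `TinyDataset` are illustrative stand-ins for the repo-internal `BoringModel`/`RandomDataset` test helpers and are not part of the PR.

```python
import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin


class TinyDataset(Dataset):
    """64 random feature vectors of size 32, mirroring the tests' RandomDataset(32, 64)."""

    def __init__(self):
        self.data = torch.randn(64, 32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


class TinyModel(LightningModule):
    """Illustrative stand-in for the BoringModel helper used by the tests."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def training_step(self, batch, batch_idx):
        return self.layer(batch).sum()

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)

    def train_dataloader(self):
        return DataLoader(TinyDataset(), batch_size=2)


# Passing `logging_batch_size_per_gpu` pre-populates
# `train_micro_batch_size_per_gpu` in the DeepSpeed config, so the plugin no
# longer infers it from `train_dataloader()` and the `rank_zero_warn` added in
# PATCH 1/7 never fires. Requires a GPU and the `deepspeed` package installed.
trainer = Trainer(
    gpus=1,
    plugins=[DeepSpeedPlugin(logging_batch_size_per_gpu=2)],
    fast_dev_run=True,
)
trainer.fit(TinyModel())
```

Omitting `logging_batch_size_per_gpu` would instead emit the warning once on rank zero and fall back to `_auto_select_batch_size()`, the path exercised by `test_deepspeed_warn_train_dataloader_called` above.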