From d9a942cb1ee8d6173adba6f7a76fefb1c987b4d5 Mon Sep 17 00:00:00 2001
From: camruta
Date: Fri, 5 Mar 2021 15:13:55 -0800
Subject: [PATCH 1/8] Add teardown method to BaseProfiler class.

---
 a.out                                   | Bin 0 -> 49424 bytes
 pytorch_lightning/profiler/profilers.py |  17 +++++++++++++++--
 pytorch_lightning/profiler/pytorch.py   |   5 ++++-
 3 files changed, 19 insertions(+), 3 deletions(-)
 create mode 100755 a.out

diff --git a/a.out b/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..2b2f324763ddafba09fc04dd9c28501de6790233
GIT binary patch
literal 49424
[49,424 bytes of base85 binary payload omitted -- a.out is a stray compiled build artifact accidentally committed here; it is deleted again in PATCH 6/8]

literal 0
HcmV?d00001

diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py
index d704ba83236c1..943902e2fa979 100644
--- a/pytorch_lightning/profiler/profilers.py
+++ b/pytorch_lightning/profiler/profilers.py
@@ -55,6 +55,10 @@ def start(self, action_name: str) -> None:
     def stop(self, action_name: str) -> None:
         """Defines how to record the duration once an action is complete."""
 
+    @abstractmethod
+    def teardown(self) -> None:
+        """Execute arbitrary post-profiling tear-down steps as defined by subclass."""
+
     @contextmanager
     def profile(self, action_name: str) -> None:
         """
@@ -114,6 +118,9 @@ def start(self, action_name: str) -> None:
     def stop(self, action_name: str) -> None:
         pass
 
+    def teardown(self) -> None:
+        pass
+
     def summary(self) -> str:
         return ""
 
@@ -214,11 +221,14 @@ def describe(self):
         if self.output_file:
             self.output_file.flush()
 
-    def __del__(self):
+    def teardown(self) -> None:
         """Close profiler's stream."""
         if self.output_file:
             self.output_file.close()
 
+    def __del__(self):
+        self.teardown()
+
 
 class AdvancedProfiler(BaseProfiler):
     """
@@ -286,7 +296,10 @@ def describe(self):
         if self.output_file:
             self.output_file.flush()
 
-    def __del__(self):
+    def teardown(self) -> None:
         """Close profiler's stream."""
         if self.output_file:
             self.output_file.close()
+
+    def __del__(self):
+        self.teardown()
diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py
index 88a33a3d367f8..71da6a67cae20 100644
--- a/pytorch_lightning/profiler/pytorch.py
+++ b/pytorch_lightning/profiler/pytorch.py
@@ -297,7 +297,10 @@ def describe(self):
         if self.output_file:
             self.output_file.flush()
 
-    def __del__(self):
+    def teardown(self) -> None:
         """Close profiler's stream."""
         if self.output_file:
             self.output_file.close()
+
+    def __del__(self):
+        self.teardown()

From 709d4bf32b06d3955c891fe77404aa42629a557d Mon Sep 17 00:00:00 2001
From: camruta
Date: Fri, 5 Mar 2021 15:29:12 -0800
Subject: [PATCH 2/8] Update the changelog.

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 703a2c7eec19f..7f1dd73d39978 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274)) +- Added teardown method to BaseProfiler to enable subclasses define post-profiling steps outside of __del__ method. ### Changed From af608f05f6ea6f8f76c72d9679ad9fedd62aee3a Mon Sep 17 00:00:00 2001 From: camruta Date: Fri, 12 Mar 2021 15:09:13 -0800 Subject: [PATCH 3/8] Call profiler.teardown when trainer is exiting and add unit test. --- pytorch_lightning/profiler/profilers.py | 5 +--- pytorch_lightning/trainer/trainer.py | 1 + tests/trainer/test_trainer.py | 39 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index 943902e2fa979..b61e8dfa65f72 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -55,9 +55,9 @@ def start(self, action_name: str) -> None: def stop(self, action_name: str) -> None: """Defines how to record the duration once an action is complete.""" - @abstractmethod def teardown(self) -> None: """Execute arbitrary post-profiling tear-down steps as defined by subclass.""" + pass @contextmanager def profile(self, action_name: str) -> None: @@ -118,9 +118,6 @@ def start(self, action_name: str) -> None: def stop(self, action_name: str) -> None: pass - def teardown(self) -> None: - pass - def summary(self) -> str: return "" diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c3039d24aadc0..05db57fa96176 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -480,6 +480,7 @@ def fit( self.call_hook('on_fit_end') # teardown + self.profiler.teardown() self.call_teardown_hook(model) if self.state != TrainerState.INTERRUPTED: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 5b06879b1f6d1..de6615d22292c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1603,6 +1603,45 @@ def test_pytorch_profiler_nested_emit_nvtx(tmpdir): ) trainer.fit(model) +def test_profiler_teardown(tmpdir): + """ + This test checks if profiler teardown method is called when + trainer is exiting. 
+ """ + profilerSimple = SimpleProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) + profilerAdvanced = AdvancedProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) + profilerPytorch = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + profiler=profilerSimple, + ) + trainer.fit(model) + + assert profilerSimple.output_file.closed + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + profiler=profilerAdvanced, + ) + trainer.fit(model) + + assert profilerAdvanced.output_file.closed + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + profiler=profilerPytorch, + ) + trainer.fit(model) + + assert profilerPytorch.output_file.closed + @pytest.mark.parametrize( ["limit_train_batches", "global_step", "num_training_batches", "current_epoch", "should_train"], From 9ab304c449c2c6e9ede92ca7954013307da4b958 Mon Sep 17 00:00:00 2001 From: camruta Date: Fri, 12 Mar 2021 15:13:04 -0800 Subject: [PATCH 4/8] Formatting --- tests/trainer/test_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index de6615d22292c..31573d1fd677c 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1603,6 +1603,7 @@ def test_pytorch_profiler_nested_emit_nvtx(tmpdir): ) trainer.fit(model) + def test_profiler_teardown(tmpdir): """ This test checks if profiler teardown method is called when From ffb5b2113363ccbf59fd57bc307410bfed7dc552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Mar 2021 01:32:15 +0100 Subject: [PATCH 5/8] Apply suggestions from code review Co-authored-by: ananthsub --- tests/trainer/test_trainer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 31573d1fd677c..001a23ef2fe15 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1609,19 +1609,19 @@ def test_profiler_teardown(tmpdir): This test checks if profiler teardown method is called when trainer is exiting. 
""" - profilerSimple = SimpleProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) - profilerAdvanced = AdvancedProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) - profilerPytorch = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0) + simple_profiler = SimpleProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) + advanced_profiler = AdvancedProfiler(output_filename=os.path.join(tmpdir, "profiler.txt")) + pytorch_profiler = PyTorchProfiler(output_filename=os.path.join(tmpdir, "profiler.txt"), local_rank=0) model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, fast_dev_run=True, - profiler=profilerSimple, + profiler=simple_profiler, ) trainer.fit(model) - assert profilerSimple.output_file.closed + assert simple_profiler.output_file.closed model = BoringModel() trainer = Trainer( @@ -1631,7 +1631,7 @@ def test_profiler_teardown(tmpdir): ) trainer.fit(model) - assert profilerAdvanced.output_file.closed + assert advanced_profiler.output_file.closed model = BoringModel() trainer = Trainer( @@ -1641,7 +1641,7 @@ def test_profiler_teardown(tmpdir): ) trainer.fit(model) - assert profilerPytorch.output_file.closed + assert pytorch_profiler.output_file.closed @pytest.mark.parametrize( From d3b5adb81a69529d336a462798d4f30af5ce0bfa Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 22 Mar 2021 01:47:33 +0100 Subject: [PATCH 6/8] Fix CHANGELOG. Remove file. Use parametrize --- CHANGELOG.md | 19 ++++++++++++------- a.out | Bin 49424 -> 0 bytes pytorch_lightning/trainer/trainer.py | 2 +- tests/test_profiler.py | 18 ++++++++++++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) delete mode 100755 a.out diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c24946d5f74d..f2bcff12acc5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/PyTorchLightning/pytorch-lightning/pull/5470)) + - Added support to checkpoint after training steps in `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) + - Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072)) @@ -36,7 +38,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274)) -- Added teardown method to BaseProfiler to enable subclasses define post-profiling steps outside of __del__ method. + +- Added `teardown` method to `BaseProfiler` to enable subclasses defe post-profiling steps outside of `__del__` ([#6370](https://github.com/PyTorchLightning/pytorch-lightning/pull/6370)) + - Added no return warning to predict ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) @@ -121,6 +125,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
 - Added Autocast in validation, test and predict modes for Native AMP ([#6565](https://github.com/PyTorchLightning/pytorch-lightning/pull/6565))
 
+
 - Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011))
 
@@ -148,6 +153,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed LightningModule `all_gather` on cpu tensors ([#6416](https://github.com/PyTorchLightning/pytorch-lightning/pull/6416))
 
+- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587))
+
+
+- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576))
+
+
 - Fixed torch distributed not available in setup hook for DDP ([#6506](https://github.com/PyTorchLightning/pytorch-lightning/pull/6506))
 
@@ -171,12 +182,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541))
 
-- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587))
-
-
-- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576))
-
-
 ## [1.2.3] - 2021-03-09
 
 ### Fixed
 
diff --git a/a.out b/a.out
deleted file mode 100755
index 2b2f324763ddafba09fc04dd9c28501de6790233..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 49424
[49,424 bytes of base85 binary payload omitted -- reverse hunk for the stray a.out build artifact introduced in PATCH 1/8 and deleted here]

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 56306bc154d28..cc95146932382 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -484,7 +484,6 @@ def fit(
         self.call_hook('on_fit_end')
 
         # teardown
-        self.profiler.teardown()
         self.call_teardown_hook(model)
 
         if self.state != TrainerState.INTERRUPTED:
@@ -1078,6 +1077,7 @@ def call_teardown_hook(self, model: LightningModule) -> None:
         else:
             state = None
 
+        self.profiler.teardown()
         self.teardown(stage=state)
         model.teardown(stage=state)
 
diff --git a/tests/test_profiler.py b/tests/test_profiler.py
index 5221c0cbf7bf6..63d2334d1d068 100644
--- a/tests/test_profiler.py
+++ b/tests/test_profiler.py
@@ -316,3 +316,21 @@ def test_pytorch_profiler_nested_emit_nvtx(tmpdir):
         gpus=1,
     )
     trainer.fit(model)
+
+
+@pytest.mark.parametrize("cls", (SimpleProfiler, AdvancedProfiler, PyTorchProfiler))
+def test_profiler_teardown(tmpdir, cls):
+    """
+    This test checks if profiler teardown method is called when trainer is exiting.
+ """ + profiler = cls(output_filename=os.path.join(tmpdir, "profiler.txt")) + + model = BoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + profiler=profiler, + ) + trainer.fit(model) + + assert profiler.output_file.closed From f7c0cbc64a48232467637482942e0dbbe2fd6b1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Mar 2021 01:58:08 +0100 Subject: [PATCH 7/8] Update CHANGELOG.md Co-authored-by: ananthsub --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2bcff12acc5b..5f005f583c5ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274)) -- Added `teardown` method to `BaseProfiler` to enable subclasses defe post-profiling steps outside of `__del__` ([#6370](https://github.com/PyTorchLightning/pytorch-lightning/pull/6370)) +- Added `teardown` method to `BaseProfiler` to enable subclasses defining post-profiling steps outside of `__del__` ([#6370](https://github.com/PyTorchLightning/pytorch-lightning/pull/6370)) - Added no return warning to predict ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) From 14ab30b297263f44c6bdfcf8b4fc406c61756dcb Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 22 Mar 2021 08:32:18 +0000 Subject: [PATCH 8/8] resolve test --- .gitignore | 1 + pytorch_lightning/profiler/profilers.py | 6 ++---- pytorch_lightning/profiler/pytorch.py | 3 +-- pytorch_lightning/trainer/training_loop.py | 1 + tests/test_profiler.py | 4 ++-- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index cd0ba22453512..c007140257188 100644 --- a/.gitignore +++ b/.gitignore @@ -157,3 +157,4 @@ tags data MNIST runs +*traces* diff --git a/pytorch_lightning/profiler/profilers.py b/pytorch_lightning/profiler/profilers.py index b61e8dfa65f72..55898dc2ee4e1 100644 --- a/pytorch_lightning/profiler/profilers.py +++ b/pytorch_lightning/profiler/profilers.py @@ -215,8 +215,7 @@ def log_row(action, mean, total): def describe(self): """Logs a profile report after the conclusion of the training run.""" super().describe() - if self.output_file: - self.output_file.flush() + self.teardown() def teardown(self) -> None: """Close profiler's stream.""" @@ -290,8 +289,7 @@ def summary(self) -> str: def describe(self): """Logs a profile report after the conclusion of the training run.""" super().describe() - if self.output_file: - self.output_file.flush() + self.teardown() def teardown(self) -> None: """Close profiler's stream.""" diff --git a/pytorch_lightning/profiler/pytorch.py b/pytorch_lightning/profiler/pytorch.py index 71da6a67cae20..fdde80589acf3 100644 --- a/pytorch_lightning/profiler/pytorch.py +++ b/pytorch_lightning/profiler/pytorch.py @@ -294,8 +294,7 @@ def summary(self) -> str: def describe(self): """Logs a profile report after the conclusion of the training run.""" super().describe() - if self.output_file: - self.output_file.flush() + self.teardown() def teardown(self) -> None: """Close profiler's stream.""" diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 7e737c424ff26..a77d91a7402b4 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py 
@@ -140,6 +140,7 @@ def on_train_end(self): self.trainer.logger.finalize("success") # summarize profile results + # todo (tchaton) All ranks should call describe. if self.trainer.global_rank == 0: self.trainer.profiler.describe() diff --git a/tests/test_profiler.py b/tests/test_profiler.py index 63d2334d1d068..ccdd8a569c9a8 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -252,8 +252,8 @@ def test_pytorch_profiler_trainer_ddp(tmpdir, use_output_filename): assert profiler.summary() is None assert set(profiler.profiled_actions.keys()) == set() - if use_output_filename: - profiler.describe() + # todo (tchaton) add support for all ranks + if use_output_filename and os.getenv("LOCAL_RANK") == "0": data = Path(profiler.output_fname).read_text() assert len(data) > 0
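
The end state of this series: `BaseProfiler.teardown()` is a no-op hook, the file-backed
profilers override it to close their output stream, `describe()` and the trainer's
`call_teardown_hook` invoke it explicitly, and `__del__` merely delegates to it as a
safety net. The sketch below distills that lifecycle in isolation. It is illustrative
only: `MiniProfiler`, `MiniFileProfiler`, and `run_with_teardown` are hypothetical
stand-ins, not the actual pytorch_lightning classes or Trainer internals.

    # Illustrative sketch of the teardown pattern; names are hypothetical,
    # not the pytorch_lightning API.
    from abc import ABC


    class MiniProfiler(ABC):
        """Base class: `teardown` is a no-op so subclasses without resources skip it."""

        def teardown(self) -> None:
            pass

        def __del__(self) -> None:
            # Safety net only: the garbage collector may run this late, or not
            # at all at interpreter exit, so callers should not rely on it.
            self.teardown()


    class MiniFileProfiler(MiniProfiler):
        """Subclass that owns a stream and must close it deterministically."""

        def __init__(self, output_filename: str) -> None:
            self.output_file = open(output_filename, "w")

        def teardown(self) -> None:
            # Idempotent close: safe even if describe() already tore down.
            if self.output_file and not self.output_file.closed:
                self.output_file.close()


    def run_with_teardown(profiler: MiniFileProfiler) -> None:
        # Stands in for Trainer.fit -> call_teardown_hook: teardown is called
        # explicitly on exit instead of being left to __del__.
        try:
            profiler.output_file.write("profile report\n")
        finally:
            profiler.teardown()


    if __name__ == "__main__":
        import os
        import tempfile

        path = os.path.join(tempfile.mkdtemp(), "profiler.txt")
        prof = MiniFileProfiler(path)
        run_with_teardown(prof)
        assert prof.output_file.closed  # the property test_profiler_teardown checks

Relying on `__del__` alone is fragile because CPython runs it only when the object is
collected, which may be long after the report file matters (or never, during interpreter
shutdown). An explicit `teardown()` invoked from the trainer's own teardown path makes
the close deterministic, which is exactly what `test_profiler_teardown` asserts via
`profiler.output_file.closed`.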