From 3a0ec8a0cac23b2de04b5a9480ee72962d8740f1 Mon Sep 17 00:00:00 2001
From: Arun Babu
Date: Thu, 21 Nov 2019 15:57:54 -0800
Subject: [PATCH] Fix broken gradients logging and add lr logging to tensorboard (#1158)

Summary:
Pull Request resolved: https://github.com/facebookresearch/pytext/pull/1158

This should help to monitor the learning rate (lr) when using warmup/annealing, etc.

Reviewed By: geof90

Differential Revision: D18624642

fbshipit-source-id: 53f3bbf73c285fb88cd81f260771e31c0083e4c9
---
 pytext/metric_reporters/channel.py         | 15 +++++++++------
 pytext/metric_reporters/metric_reporter.py |  5 ++++-
 pytext/trainers/trainer.py                 |  8 +++++++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/pytext/metric_reporters/channel.py b/pytext/metric_reporters/channel.py
index f9b14469a..a4ac90096 100644
--- a/pytext/metric_reporters/channel.py
+++ b/pytext/metric_reporters/channel.py
@@ -170,6 +170,7 @@ def report(
         context,
         meta,
         model,
+        optimizer,
         *args,
     ):
         """
@@ -213,17 +214,19 @@ def report(
             self.add_scalars(prefix, metrics, epoch)
 
         if stage == Stage.TRAIN:
+            if optimizer is not None:
+                for idx, param_group in enumerate(optimizer.param_groups):
+                    self.summary_writer.add_scalar(
+                        f"optimizer.lr.param_group.{idx}", param_group["lr"], epoch
+                    )
             for key, val in model.named_parameters():
                 if val is not None and len(val) > 0 and not (val == 0).all():
                     limit = 9.9e19
+                    grad = val.grad
                     val = torch.clamp(val.float(), -limit, limit)
                     self.summary_writer.add_histogram(key, val, epoch)
-                    if (
-                        val.grad is not None
-                        and len(val.grad) > 0
-                        and not (val.grad == 0).all()
-                    ):
-                        grad = torch.clamp(val.grad.float(), -limit, limit)
+                    if grad is not None and len(grad) > 0 and not (grad == 0).all():
+                        grad = torch.clamp(grad.float(), -limit, limit)
                         self.summary_writer.add_histogram(
                             key + "_gradients", grad, epoch
                         )
diff --git a/pytext/metric_reporters/metric_reporter.py b/pytext/metric_reporters/metric_reporter.py
index f9798d1a2..b8fe875be 100644
--- a/pytext/metric_reporters/metric_reporter.py
+++ b/pytext/metric_reporters/metric_reporter.py
@@ -206,7 +206,9 @@ def get_meta(self):
         """
         return {}
 
-    def report_metric(self, model, stage, epoch, reset=True, print_to_channels=True):
+    def report_metric(
+        self, model, stage, epoch, reset=True, print_to_channels=True, optimizer=None
+    ):
         """
         Calculate metrics and average loss, report all statistic data to channels
 
@@ -241,6 +243,7 @@ def report_metric(self, model, stage, epoch, reset=True, print_to_channels=True)
                     self.all_context,
                     self.get_meta(),
                     model,
+                    optimizer,
                 )
 
         if reset:
diff --git a/pytext/trainers/trainer.py b/pytext/trainers/trainer.py
index 4d4a9f063..742b86f64 100644
--- a/pytext/trainers/trainer.py
+++ b/pytext/trainers/trainer.py
@@ -509,7 +509,13 @@ def run_epoch(
         if report_metric:
             with timing.time("report metrics"):
                 metrics = metric_reporter.report_metric(
-                    model, state.stage, state.epoch, print_to_channels=(state.rank == 0)
+                    model,
+                    state.stage,
+                    state.epoch,
+                    print_to_channels=(state.rank == 0),
+                    optimizer=getattr(
+                        state, "optimizer", None
+                    ),  # optimizer is not present during test
                 )
         else:
             metric_reporter._reset()
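
Note on the gradients fix (illustrative, not part of the diff): in the old channel.py code, val was rebound to the output of torch.clamp(val.float(), ...) before val.grad was checked; the clamped result is a new, non-leaf tensor whose .grad is always None, so the gradient histograms were silently never written. The patch therefore reads grad = val.grad from the original parameter first. A minimal standalone sketch of the underlying PyTorch behavior (plain torch, no PyText; the names are made up for the demo):

    import torch

    # A leaf parameter gets its gradient populated by backward().
    p = torch.nn.Parameter(torch.randn(4))
    p.sum().backward()
    print(p.grad is None)        # False: the leaf tensor has a .grad

    # Rebinding the name to a derived tensor loses access to that gradient:
    # torch.clamp(p.float(), ...) returns a new, non-leaf tensor whose .grad is None
    # (accessing it may also emit a "non-leaf Tensor" warning).
    clamped = torch.clamp(p.float(), -9.9e19, 9.9e19)
    print(clamped.grad is None)  # True: this is why the old histogram branch never ran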
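
Note on the new lr logging (illustrative, not part of the diff): the channel now writes one scalar per optimizer param group under the tag optimizer.lr.param_group.<idx>, which makes warmup/annealing schedules show up as curves in TensorBoard. A minimal sketch of the same pattern outside PyText, assuming torch.utils.tensorboard (and the tensorboard package) is available; the model, optimizer, scheduler, and log dir below are placeholders:

    import torch
    from torch.utils.tensorboard import SummaryWriter  # needs the tensorboard package

    # Placeholder model/optimizer/scheduler, only to show where the logged value comes from.
    model = torch.nn.Linear(8, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    writer = SummaryWriter(log_dir="/tmp/lr_demo")  # arbitrary log dir

    for epoch in range(5):
        model(torch.randn(3, 8)).sum().backward()   # stand-in for a real training step
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()
        # Same pattern the patched channel uses: one scalar per optimizer param group.
        for idx, param_group in enumerate(optimizer.param_groups):
            writer.add_scalar(f"optimizer.lr.param_group.{idx}", param_group["lr"], epoch)

    writer.close()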