From 4e4d3e3a5b27ba12dddd23a1ee4f3eccf22de56f Mon Sep 17 00:00:00 2001 From: glennjocher Date: Sun, 21 Aug 2022 16:23:02 +0200 Subject: [PATCH 1/2] Log TensorBoard graph on pretrain_routine_end --- train.py | 6 +++--- utils/loggers/__init__.py | 33 +++++++++++++++++---------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/train.py b/train.py index ac38d04dba90..af04b11c5b9e 100644 --- a/train.py +++ b/train.py @@ -219,7 +219,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # run AutoAnchor model.half().float() # pre-reduce anchor precision - callbacks.run('on_pretrain_routine_end', labels, names, plots) + callbacks.run('on_pretrain_routine_end', model, labels, names) # DDP mode if cuda and RANK != -1: @@ -328,7 +328,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%11s' * 2 + '%11.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) - callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots) + callbacks.run('on_train_batch_end', ni, imgs, targets, paths) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ @@ -420,7 +420,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio if is_coco: callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) - callbacks.run('on_train_end', last, best, plots, epoch, results) + callbacks.run('on_train_end', last, best, epoch, results) torch.cuda.empty_cache() return results diff --git a/utils/loggers/__init__.py b/utils/loggers/__init__.py index b9869df26a43..9826b439ae15 100644 --- a/utils/loggers/__init__.py +++ b/utils/loggers/__init__.py @@ -11,7 +11,7 @@ import torch from torch.utils.tensorboard import SummaryWriter -from utils.general import colorstr, cv2, threaded +from utils.general import colorstr, cv2 from utils.loggers.clearml.clearml_utils import ClearmlLogger from utils.loggers.wandb.wandb_utils import WandbLogger from utils.plots import plot_images, plot_labels, plot_results @@ -49,6 +49,7 @@ def __init__(self, save_dir=None, weights=None, opt=None, hyp=None, logger=None, self.weights = weights self.opt = opt self.hyp = hyp + self.plots = not opt.noplots # plot results self.logger = logger # for printing results to console self.include = include self.keys = [ @@ -110,22 +111,22 @@ def on_train_start(self): # Callback runs on train start pass - def on_pretrain_routine_end(self, labels, names, plots): + def on_pretrain_routine_end(self, model, labels, names): # Callback runs on pre-train routine end - if plots: + if self.plots: plot_labels(labels, names, self.save_dir) - paths = self.save_dir.glob('*labels*.jpg') # training labels - if self.wandb: - self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]}) - # if self.clearml: - # pass # ClearML saves these images automatically using hooks + if self.tb and not self.opt.sync_bn: + log_tensorboard_graph(self.tb, model, imgsz=(640, 640)) + paths = self.save_dir.glob('*labels*.jpg') # training labels + if self.wandb: + self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]}) + # if self.clearml: + # pass # ClearML saves these images automatically using hooks - def on_train_batch_end(self, ni, model, imgs, targets, paths, plots): + def on_train_batch_end(self, ni, imgs, targets, paths): # Callback runs on train batch end # ni: number integrated batches (since train start) - if plots: - if ni == 0 and not self.opt.sync_bn and self.tb: - log_tensorboard_graph(self.tb, model, imgsz=list(imgs.shape[2:4])) + if self.plots: if ni < 3: f = self.save_dir / f'train_batch{ni}.jpg' # filename plot_images(imgs, targets, paths, f) @@ -197,9 +198,9 @@ def on_model_save(self, last, epoch, final_epoch, best_fitness, fi): model_name='Latest Model', auto_delete_file=False) - def on_train_end(self, last, best, plots, epoch, results): + def on_train_end(self, last, best, epoch, results): # Callback runs on training end, i.e. saving best model - if plots: + if self.plots: plot_results(file=self.save_dir / 'results.csv') # save results.png files = ['results.png', 'confusion_matrix.png', *(f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R'))] files = [(self.save_dir / f) for f in files if (self.save_dir / f).exists()] # filter @@ -300,5 +301,5 @@ def log_tensorboard_graph(tb, model, imgsz=(640, 640)): with warnings.catch_warnings(): warnings.simplefilter('ignore') # suppress jit trace warning tb.add_graph(torch.jit.trace(de_parallel(model), im, strict=False), []) - except Exception: - print('WARNING: TensorBoard graph visualization failure') + except Exception as e: + print(f'WARNING: TensorBoard graph visualization failure {e}') From 66c86a69568b75f7e8fab9d8cca3750ea6fc909e Mon Sep 17 00:00:00 2001 From: glennjocher Date: Sun, 21 Aug 2022 16:47:11 +0200 Subject: [PATCH 2/2] fix --- train.py | 4 ++-- utils/loggers/__init__.py | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/train.py b/train.py index af04b11c5b9e..e4c9b6ae6749 100644 --- a/train.py +++ b/train.py @@ -219,7 +219,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # run AutoAnchor model.half().float() # pre-reduce anchor precision - callbacks.run('on_pretrain_routine_end', model, labels, names) + callbacks.run('on_pretrain_routine_end', labels, names) # DDP mode if cuda and RANK != -1: @@ -328,7 +328,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%11s' * 2 + '%11.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) - callbacks.run('on_train_batch_end', ni, imgs, targets, paths) + callbacks.run('on_train_batch_end', model, ni, imgs, targets, paths) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ diff --git a/utils/loggers/__init__.py b/utils/loggers/__init__.py index 9826b439ae15..98a123eee74d 100644 --- a/utils/loggers/__init__.py +++ b/utils/loggers/__init__.py @@ -11,7 +11,7 @@ import torch from torch.utils.tensorboard import SummaryWriter -from utils.general import colorstr, cv2 +from utils.general import colorstr, cv2, threaded from utils.loggers.clearml.clearml_utils import ClearmlLogger from utils.loggers.wandb.wandb_utils import WandbLogger from utils.plots import plot_images, plot_labels, plot_results @@ -111,26 +111,26 @@ def on_train_start(self): # Callback runs on train start pass - def on_pretrain_routine_end(self, model, labels, names): + def on_pretrain_routine_end(self, labels, names): # Callback runs on pre-train routine end if self.plots: plot_labels(labels, names, self.save_dir) - if self.tb and not self.opt.sync_bn: - log_tensorboard_graph(self.tb, model, imgsz=(640, 640)) paths = self.save_dir.glob('*labels*.jpg') # training labels if self.wandb: self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]}) # if self.clearml: # pass # ClearML saves these images automatically using hooks - def on_train_batch_end(self, ni, imgs, targets, paths): + def on_train_batch_end(self, model, ni, imgs, targets, paths): # Callback runs on train batch end # ni: number integrated batches (since train start) if self.plots: if ni < 3: f = self.save_dir / f'train_batch{ni}.jpg' # filename plot_images(imgs, targets, paths, f) - if (self.wandb or self.clearml) and ni == 10: + if ni == 0 and self.tb and not self.opt.sync_bn: + log_tensorboard_graph(self.tb, model, imgsz=(self.opt.imgsz, self.opt.imgsz)) + if ni == 10 and (self.wandb or self.clearml): files = sorted(self.save_dir.glob('train*.jpg')) if self.wandb: self.wandb.log({'Mosaics': [wandb.Image(str(f), caption=f.name) for f in files if f.exists()]}) @@ -292,6 +292,7 @@ def log_model(self, model_path, epoch=0, metadata={}): wandb.log_artifact(art) +@threaded def log_tensorboard_graph(tb, model, imgsz=(640, 640)): # Log model graph to TensorBoard try: