From d25c11b244746dd14d3ae4236259ba09b1cb4cef Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Tue, 14 Jun 2022 21:56:50 +0530 Subject: [PATCH 01/25] attempt at reproducibility --- train.py | 9 +++++---- utils/dataloaders.py | 7 +++++-- utils/general.py | 7 +++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/train.py b/train.py index a06ad5a418f8..cfec29c629c1 100644 --- a/train.py +++ b/train.py @@ -47,7 +47,7 @@ from utils.downloads import attempt_download from utils.general import (LOGGER, check_amp, check_dataset, check_file, check_git_status, check_img_size, check_requirements, check_suffix, check_version, check_yaml, colorstr, get_latest_run, - increment_path, init_seeds, intersect_dicts, labels_to_class_weights, + increment_path, init_seeds, dataloader_init_fn, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle, print_args, print_mutation, strip_optimizer) from utils.loggers import Loggers from utils.loggers.wandb.wandb_utils import check_wandb_resume @@ -59,7 +59,7 @@ LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html RANK = int(os.getenv('RANK', -1)) WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) - +SEED = 1 + RANK def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ @@ -101,7 +101,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' - init_seeds(1 + RANK) + init_seeds(SEED) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] @@ -232,7 +232,8 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), - shuffle=True) + shuffle=True, + worker_init_fn=dataloader_init_fn) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' diff --git a/utils/dataloaders.py b/utils/dataloaders.py index 8f32c301b772..46d831bd6fcf 100755 --- a/utils/dataloaders.py +++ b/utils/dataloaders.py @@ -106,7 +106,9 @@ def create_dataloader(path, image_weights=False, quad=False, prefix='', - shuffle=False): + shuffle=False, + worker_init_fn=None + ): if rect and shuffle: LOGGER.warning('WARNING: --rect is incompatible with DataLoader shuffle, setting shuffle=False') shuffle = False @@ -136,7 +138,8 @@ def create_dataloader(path, num_workers=nw, sampler=sampler, pin_memory=True, - collate_fn=LoadImagesAndLabels.collate_fn4 if quad else LoadImagesAndLabels.collate_fn), dataset + collate_fn=LoadImagesAndLabels.collate_fn4 if quad else LoadImagesAndLabels.collate_fn, + worker_init_fn=worker_init_fn), dataset class InfiniteDataLoader(dataloader.DataLoader): diff --git a/utils/general.py b/utils/general.py index 9fc7349dbb6d..18d70df65d4b 100755 --- a/utils/general.py +++ b/utils/general.py @@ -199,11 +199,18 @@ def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn + os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) + # https://pytorch.org/docs/stable/_modules/torch/cuda/random.html#manual_seed + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # for multi GPU. Exception safe cudnn.benchmark, cudnn.deterministic = (False, True) if seed == 0 else (True, False) +def dataloader_init_fn(worker_id): + rank = int(os.getenv('RANK', -1)) # rank in world for Multi-GPU trainings + np.random.seed(rank+1) def intersect_dicts(da, db, exclude=()): # Dictionary intersection of matching keys and shapes, omitting 'exclude' keys, using da values From ebe59e7b398ad873511bdbbe659105d8c1103771 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 14 Jun 2022 16:29:24 +0000 Subject: [PATCH 02/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- train.py | 5 +++-- utils/dataloaders.py | 3 +-- utils/general.py | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/train.py b/train.py index cfec29c629c1..12b2d815d7cd 100644 --- a/train.py +++ b/train.py @@ -46,8 +46,8 @@ from utils.dataloaders import create_dataloader from utils.downloads import attempt_download from utils.general import (LOGGER, check_amp, check_dataset, check_file, check_git_status, check_img_size, - check_requirements, check_suffix, check_version, check_yaml, colorstr, get_latest_run, - increment_path, init_seeds, dataloader_init_fn, intersect_dicts, labels_to_class_weights, + check_requirements, check_suffix, check_version, check_yaml, colorstr, dataloader_init_fn, + get_latest_run, increment_path, init_seeds, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle, print_args, print_mutation, strip_optimizer) from utils.loggers import Loggers from utils.loggers.wandb.wandb_utils import check_wandb_resume @@ -61,6 +61,7 @@ WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) SEED = 1 + RANK + def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ diff --git a/utils/dataloaders.py b/utils/dataloaders.py index 46d831bd6fcf..6545656bd3c3 100755 --- a/utils/dataloaders.py +++ b/utils/dataloaders.py @@ -107,8 +107,7 @@ def create_dataloader(path, quad=False, prefix='', shuffle=False, - worker_init_fn=None - ): + worker_init_fn=None): if rect and shuffle: LOGGER.warning('WARNING: --rect is incompatible with DataLoader shuffle, setting shuffle=False') shuffle = False diff --git a/utils/general.py b/utils/general.py index 18d70df65d4b..87244d089843 100755 --- a/utils/general.py +++ b/utils/general.py @@ -205,12 +205,14 @@ def init_seeds(seed=0): torch.manual_seed(seed) # https://pytorch.org/docs/stable/_modules/torch/cuda/random.html#manual_seed torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) # for multi GPU. Exception safe + torch.cuda.manual_seed_all(seed) # for multi GPU. Exception safe cudnn.benchmark, cudnn.deterministic = (False, True) if seed == 0 else (True, False) + def dataloader_init_fn(worker_id): rank = int(os.getenv('RANK', -1)) # rank in world for Multi-GPU trainings - np.random.seed(rank+1) + np.random.seed(rank + 1) + def intersect_dicts(da, db, exclude=()): # Dictionary intersection of matching keys and shapes, omitting 'exclude' keys, using da values From 254d37943b00a7e1b70611bbb3b38326d4dcba6e Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Wed, 15 Jun 2022 20:45:30 +0530 Subject: [PATCH 03/25] use deterministic algs --- utils/general.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/general.py b/utils/general.py index 87244d089843..8f7514f8301d 100755 --- a/utils/general.py +++ b/utils/general.py @@ -199,7 +199,9 @@ def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn + torch.use_deterministic_algorithms(True, warn_only=True) os.environ['PYTHONHASHSEED'] = str(seed) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) From 85f4e1660286c1c081abb2e9eba38f58254c4fcc Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Sat, 18 Jun 2022 02:17:39 +0530 Subject: [PATCH 04/25] fix everything :) --- train.py | 5 ++--- utils/general.py | 5 +---- val.py | 4 +++- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/train.py b/train.py index 12b2d815d7cd..20bf1feaeb05 100644 --- a/train.py +++ b/train.py @@ -233,8 +233,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), - shuffle=True, - worker_init_fn=dataloader_init_fn) + shuffle=True) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' @@ -360,7 +359,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Backward scaler.scale(loss).backward() - + # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step diff --git a/utils/general.py b/utils/general.py index 8f7514f8301d..fbcf88045c03 100755 --- a/utils/general.py +++ b/utils/general.py @@ -198,17 +198,14 @@ def print_args(args: Optional[dict] = None, show_file=True, show_fcn=False): def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible - import torch.backends.cudnn as cudnn - torch.use_deterministic_algorithms(True, warn_only=True) + torch.use_deterministic_algorithms(True) os.environ['PYTHONHASHSEED'] = str(seed) os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - # https://pytorch.org/docs/stable/_modules/torch/cuda/random.html#manual_seed torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) # for multi GPU. Exception safe - cudnn.benchmark, cudnn.deterministic = (False, True) if seed == 0 else (True, False) def dataloader_init_fn(worker_id): diff --git a/val.py b/val.py index dc7f28f46882..ae3e5aaf902b 100644 --- a/val.py +++ b/val.py @@ -89,7 +89,9 @@ def process_batch(detections, labels, iouv): matches = matches[np.unique(matches[:, 1], return_index=True)[1]] # matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + torch.use_deterministic_algorithms(False) correct[matches[:, 1].astype(int), i] = True + torch.use_deterministic_algorithms(True) return correct @@ -259,7 +261,7 @@ def run( plot_images(im, output_to_target(out), paths, save_dir / f'val_batch{batch_i}_pred.jpg', names) # pred callbacks.run('on_val_batch_end') - + # Compute metrics stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*stats)] # to numpy if len(stats) and stats[0].any(): From 942014f7b7a746cdc017c590089878b50840f01d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Jun 2022 20:48:22 +0000 Subject: [PATCH 05/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- train.py | 2 +- val.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index 20bf1feaeb05..885a90d09862 100644 --- a/train.py +++ b/train.py @@ -359,7 +359,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Backward scaler.scale(loss).backward() - + # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step diff --git a/val.py b/val.py index ae3e5aaf902b..6a1a238534eb 100644 --- a/val.py +++ b/val.py @@ -261,7 +261,7 @@ def run( plot_images(im, output_to_target(out), paths, save_dir / f'val_batch{batch_i}_pred.jpg', names) # pred callbacks.run('on_val_batch_end') - + # Compute metrics stats = [torch.cat(x, 0).cpu().numpy() for x in zip(*stats)] # to numpy if len(stats) and stats[0].any(): From d764a75c80d8a975c6c0bdaf1c986b4f06da0167 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Sat, 18 Jun 2022 02:32:40 +0530 Subject: [PATCH 06/25] revert dataloader changes --- train.py | 6 +++--- utils/dataloaders.py | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/train.py b/train.py index 20bf1feaeb05..336ce5518060 100644 --- a/train.py +++ b/train.py @@ -46,8 +46,8 @@ from utils.dataloaders import create_dataloader from utils.downloads import attempt_download from utils.general import (LOGGER, check_amp, check_dataset, check_file, check_git_status, check_img_size, - check_requirements, check_suffix, check_version, check_yaml, colorstr, dataloader_init_fn, - get_latest_run, increment_path, init_seeds, intersect_dicts, labels_to_class_weights, + check_requirements, check_suffix, check_version, check_yaml, colorstr,get_latest_run, + increment_path, init_seeds, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle, print_args, print_mutation, strip_optimizer) from utils.loggers import Loggers from utils.loggers.wandb.wandb_utils import check_wandb_resume @@ -359,7 +359,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Backward scaler.scale(loss).backward() - + # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step diff --git a/utils/dataloaders.py b/utils/dataloaders.py index 6545656bd3c3..8f32c301b772 100755 --- a/utils/dataloaders.py +++ b/utils/dataloaders.py @@ -106,8 +106,7 @@ def create_dataloader(path, image_weights=False, quad=False, prefix='', - shuffle=False, - worker_init_fn=None): + shuffle=False): if rect and shuffle: LOGGER.warning('WARNING: --rect is incompatible with DataLoader shuffle, setting shuffle=False') shuffle = False @@ -137,8 +136,7 @@ def create_dataloader(path, num_workers=nw, sampler=sampler, pin_memory=True, - collate_fn=LoadImagesAndLabels.collate_fn4 if quad else LoadImagesAndLabels.collate_fn, - worker_init_fn=worker_init_fn), dataset + collate_fn=LoadImagesAndLabels.collate_fn4 if quad else LoadImagesAndLabels.collate_fn), dataset class InfiniteDataLoader(dataloader.DataLoader): From 47da108de9d8bf2226ba2e6225d76e477368ac60 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Jun 2022 21:03:18 +0000 Subject: [PATCH 07/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 336ce5518060..fb645538ccfd 100644 --- a/train.py +++ b/train.py @@ -46,7 +46,7 @@ from utils.dataloaders import create_dataloader from utils.downloads import attempt_download from utils.general import (LOGGER, check_amp, check_dataset, check_file, check_git_status, check_img_size, - check_requirements, check_suffix, check_version, check_yaml, colorstr,get_latest_run, + check_requirements, check_suffix, check_version, check_yaml, colorstr, get_latest_run, increment_path, init_seeds, intersect_dicts, labels_to_class_weights, labels_to_image_weights, methods, one_cycle, print_args, print_mutation, strip_optimizer) from utils.loggers import Loggers From 695b665e3208fba3aff1fc122ce7f59c2dd1b72e Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 18 Jun 2022 13:35:04 +0200 Subject: [PATCH 08/25] process_batch as np --- val.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/val.py b/val.py index 6a1a238534eb..b0d49c3767a5 100644 --- a/val.py +++ b/val.py @@ -77,7 +77,7 @@ def process_batch(detections, labels, iouv): Returns: correct (Array[N, 10]), for 10 IoU levels """ - correct = torch.zeros(detections.shape[0], iouv.shape[0], dtype=torch.bool, device=iouv.device) + correct = np.zeros((detections.shape[0], iouv.shape[0])).astype(bool) iou = box_iou(labels[:, 1:], detections[:, :4]) correct_class = labels[:, 0:1] == detections[:, 5] for i in range(len(iouv)): @@ -89,10 +89,9 @@ def process_batch(detections, labels, iouv): matches = matches[np.unique(matches[:, 1], return_index=True)[1]] # matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 0], return_index=True)[1]] - torch.use_deterministic_algorithms(False) correct[matches[:, 1].astype(int), i] = True - torch.use_deterministic_algorithms(True) - return correct + + return torch.tensor(correct, dtype=torch.bool, device=iouv.device) @torch.no_grad() From 03801ebac5fdb58452b63e398b3f5a878b4f3bfa Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 18 Jun 2022 13:35:43 +0200 Subject: [PATCH 09/25] remove newline --- val.py | 1 - 1 file changed, 1 deletion(-) diff --git a/val.py b/val.py index b0d49c3767a5..f4f4bab7e92d 100644 --- a/val.py +++ b/val.py @@ -90,7 +90,6 @@ def process_batch(detections, labels, iouv): # matches = matches[matches[:, 2].argsort()[::-1]] matches = matches[np.unique(matches[:, 0], return_index=True)[1]] correct[matches[:, 1].astype(int), i] = True - return torch.tensor(correct, dtype=torch.bool, device=iouv.device) From 4cdecfec2604fe8b143f9ab73b78255e5f0858c9 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 18 Jun 2022 13:52:25 +0200 Subject: [PATCH 10/25] Remove dataloader init fcn --- utils/general.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/utils/general.py b/utils/general.py index fbcf88045c03..c3eca074960c 100755 --- a/utils/general.py +++ b/utils/general.py @@ -208,11 +208,6 @@ def init_seeds(seed=0): torch.cuda.manual_seed_all(seed) # for multi GPU. Exception safe -def dataloader_init_fn(worker_id): - rank = int(os.getenv('RANK', -1)) # rank in world for Multi-GPU trainings - np.random.seed(rank + 1) - - def intersect_dicts(da, db, exclude=()): # Dictionary intersection of matching keys and shapes, omitting 'exclude' keys, using da values return {k: v for k, v in da.items() if k in db and not any(x in k for x in exclude) and v.shape == db[k].shape} From 4a9dfbf027fee4380c5387774669d571149bb113 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 18 Jun 2022 15:56:52 +0200 Subject: [PATCH 11/25] Update val.py --- val.py | 1 + 1 file changed, 1 insertion(+) diff --git a/val.py b/val.py index f4f4bab7e92d..2e8b1a1e3b10 100644 --- a/val.py +++ b/val.py @@ -124,6 +124,7 @@ def run( compute_loss=None, ): # Initialize/load model and set device + torch.use_deterministic_algorithms(False) training = model is not None if training: # called by train.py device, pt, jit, engine = next(model.parameters()).device, True, False, False # get model device, PyTorch model From b4f5f3d4fc4172dcf1f2bf36dfcb9d2f7f808e75 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sat, 18 Jun 2022 15:59:07 +0200 Subject: [PATCH 12/25] Update train.py --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index fb645538ccfd..3e12d3a82d6a 100644 --- a/train.py +++ b/train.py @@ -324,6 +324,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio if RANK in {-1, 0}: pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() + torch.use_deterministic_algorithms(True) for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) From 82af0156a2cfc729e63ce3a362c6eba5b884eb5f Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sun, 19 Jun 2022 18:13:51 +0200 Subject: [PATCH 13/25] revert additional changes --- utils/general.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/utils/general.py b/utils/general.py index c3eca074960c..02465486725d 100755 --- a/utils/general.py +++ b/utils/general.py @@ -194,18 +194,21 @@ def print_args(args: Optional[dict] = None, show_file=True, show_fcn=False): s = (f'{Path(file).stem}: ' if show_file else '') + (f'{fcn}: ' if show_fcn else '') LOGGER.info(colorstr(s) + ', '.join(f'{k}={v}' for k, v in args.items())) - + def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible + import torch.backends.cudnn as cudnn torch.use_deterministic_algorithms(True) - os.environ['PYTHONHASHSEED'] = str(seed) os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) # for multi GPU. Exception safe + cudnn.benchmark, cudnn.deterministic = (False, True) if seed == 0 else (True, False) + # os.environ['PYTHONHASHSEED'] = str(seed) + # torch.cuda.manual_seed(seed) + # torch.cuda.manual_seed_all(seed) # for multi GPU, exception safe + def intersect_dicts(da, db, exclude=()): From 9220b5a8b68b502e75ab114f4a03ce36a839f261 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 Jun 2022 16:14:11 +0000 Subject: [PATCH 14/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utils/general.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/general.py b/utils/general.py index 02465486725d..1db439465ef3 100755 --- a/utils/general.py +++ b/utils/general.py @@ -194,7 +194,7 @@ def print_args(args: Optional[dict] = None, show_file=True, show_fcn=False): s = (f'{Path(file).stem}: ' if show_file else '') + (f'{fcn}: ' if show_fcn else '') LOGGER.info(colorstr(s) + ', '.join(f'{k}={v}' for k, v in args.items())) - + def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible @@ -208,7 +208,6 @@ def init_seeds(seed=0): # os.environ['PYTHONHASHSEED'] = str(seed) # torch.cuda.manual_seed(seed) # torch.cuda.manual_seed_all(seed) # for multi GPU, exception safe - def intersect_dicts(da, db, exclude=()): From 5366a14bb659c632c8fbcb8061063cddce256994 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sun, 19 Jun 2022 18:15:56 +0200 Subject: [PATCH 15/25] Update train.py --- train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/train.py b/train.py index 3e12d3a82d6a..94686cfa35d9 100644 --- a/train.py +++ b/train.py @@ -59,7 +59,6 @@ LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html RANK = int(os.getenv('RANK', -1)) WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) -SEED = 1 + RANK def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary @@ -102,7 +101,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' - init_seeds(SEED) + init_seeds(1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] From f798d8e8d7edb3dabfeaf1f0698deb680be6a781 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Sun, 19 Jun 2022 19:35:48 +0200 Subject: [PATCH 16/25] Add --seed arg --- train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 94686cfa35d9..e3c153b9b775 100644 --- a/train.py +++ b/train.py @@ -101,7 +101,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' - init_seeds(1 + RANK) + init_seeds(opt.seed + 1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] @@ -509,6 +509,7 @@ def parse_opt(known=False): parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)') parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2') parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)') + parser.add_argument('--seed', type=int, default=0, help='Global training seed') parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') # Weights & Biases arguments From 1eb9fbab98e9f479861f44287759cb95e6cbde1c Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Mon, 27 Jun 2022 21:27:48 +0200 Subject: [PATCH 17/25] Update general.py --- utils/general.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/general.py b/utils/general.py index ab170181d4aa..ff600b13db72 100755 --- a/utils/general.py +++ b/utils/general.py @@ -199,8 +199,8 @@ def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn - torch.use_deterministic_algorithms(True) - os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + # torch.use_deterministic_algorithms(True) + # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) From 6dc813f4d4f2720b0a8170bab1609cbeb55ebad2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Jun 2022 19:28:12 +0000 Subject: [PATCH 18/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utils/general.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/general.py b/utils/general.py index ff600b13db72..379134eb608f 100755 --- a/utils/general.py +++ b/utils/general.py @@ -199,6 +199,7 @@ def init_seeds(seed=0): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn + # torch.use_deterministic_algorithms(True) # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' random.seed(seed) From 7c6620e6e255f5495a399cf58135c7c58ac3f31c Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Mon, 27 Jun 2022 21:28:28 +0200 Subject: [PATCH 19/25] Update train.py --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index 64ca31d204c7..2738e2cd0d34 100644 --- a/train.py +++ b/train.py @@ -324,6 +324,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() torch.use_deterministic_algorithms(True) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) From 6d272fbd0e2c8fde7c3f556170b34880b851bc0f Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Mon, 27 Jun 2022 21:34:20 +0200 Subject: [PATCH 20/25] Update train.py --- train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/train.py b/train.py index 2738e2cd0d34..0fe54ac9aa82 100644 --- a/train.py +++ b/train.py @@ -59,6 +59,7 @@ LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html RANK = int(os.getenv('RANK', -1)) WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) +os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # for torch.use_deterministic_algorithms(True) def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary @@ -324,7 +325,6 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() torch.use_deterministic_algorithms(True) - os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) From dfd6306b6874dc15cd8cd18681a2c73a6a7dddf6 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 29 Jun 2022 16:09:47 +0200 Subject: [PATCH 21/25] Update val.py --- val.py | 1 - 1 file changed, 1 deletion(-) diff --git a/val.py b/val.py index 2e8b1a1e3b10..f4f4bab7e92d 100644 --- a/val.py +++ b/val.py @@ -124,7 +124,6 @@ def run( compute_loss=None, ): # Initialize/load model and set device - torch.use_deterministic_algorithms(False) training = model is not None if training: # called by train.py device, pt, jit, engine = next(model.parameters()).device, True, False, False # get model device, PyTorch model From cfc1079c662ef9c08372e0240e79fa054d6bf5e9 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 29 Jun 2022 16:10:28 +0200 Subject: [PATCH 22/25] Update train.py --- train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/train.py b/train.py index 0fe54ac9aa82..3932cfdda631 100644 --- a/train.py +++ b/train.py @@ -59,7 +59,6 @@ LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html RANK = int(os.getenv('RANK', -1)) WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1)) -os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # for torch.use_deterministic_algorithms(True) def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictionary @@ -324,7 +323,6 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio if RANK in {-1, 0}: pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() - torch.use_deterministic_algorithms(True) for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- callbacks.run('on_train_batch_start') ni = i + nb * epoch # number integrated batches (since train start) From c59d666f71cee3e5dacf7c2ec8fa23e9708662ad Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 29 Jun 2022 16:12:06 +0200 Subject: [PATCH 23/25] Update general.py --- utils/general.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/general.py b/utils/general.py index 379134eb608f..cdfb73d4f657 100755 --- a/utils/general.py +++ b/utils/general.py @@ -200,13 +200,14 @@ def init_seeds(seed=0): # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn - # torch.use_deterministic_algorithms(True) - # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + torch.use_deterministic_algorithms(True) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # for torch.use_deterministic_algorithms(True) + # os.environ['PYTHONHASHSEED'] = str(seed) + random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) cudnn.benchmark, cudnn.deterministic = (False, True) if seed == 0 else (True, False) - # os.environ['PYTHONHASHSEED'] = str(seed) # torch.cuda.manual_seed(seed) # torch.cuda.manual_seed_all(seed) # for multi GPU, exception safe From d5cded43be097866926ee839fa4b4cc0ed308b68 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Wed, 29 Jun 2022 16:14:58 +0200 Subject: [PATCH 24/25] Update general.py --- utils/general.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/utils/general.py b/utils/general.py index cdfb73d4f657..ebe858b599d1 100755 --- a/utils/general.py +++ b/utils/general.py @@ -200,9 +200,10 @@ def init_seeds(seed=0): # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn - torch.use_deterministic_algorithms(True) - os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # for torch.use_deterministic_algorithms(True) - # os.environ['PYTHONHASHSEED'] = str(seed) + if check_version(torch.__version__, '1.12.0'): # https://github.com/ultralytics/yolov5/pull/8213 + torch.use_deterministic_algorithms(True) + os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' + # os.environ['PYTHONHASHSEED'] = str(seed) random.seed(seed) np.random.seed(seed) From 76e061eb5f45b8e9f0a372d62db3c9e4024cc0ca Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 7 Jul 2022 14:15:20 +0200 Subject: [PATCH 25/25] Add deterministic argument to init_seeds() --- train.py | 2 +- utils/general.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/train.py b/train.py index b4e11aca4bf0..bf5b4c69d74c 100644 --- a/train.py +++ b/train.py @@ -101,7 +101,7 @@ def train(hyp, opt, device, callbacks): # hyp is path/to/hyp.yaml or hyp dictio # Config plots = not evolve and not opt.noplots # create plots cuda = device.type != 'cpu' - init_seeds(opt.seed + 1 + RANK) + init_seeds(opt.seed + 1 + RANK, deterministic=True) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] diff --git a/utils/general.py b/utils/general.py index ebe858b599d1..17b689010b39 100755 --- a/utils/general.py +++ b/utils/general.py @@ -195,12 +195,12 @@ def print_args(args: Optional[dict] = None, show_file=True, show_fcn=False): LOGGER.info(colorstr(s) + ', '.join(f'{k}={v}' for k, v in args.items())) -def init_seeds(seed=0): +def init_seeds(seed=0, deterministic=False): # Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html # cudnn seed 0 settings are slower and more reproducible, else faster and less reproducible import torch.backends.cudnn as cudnn - if check_version(torch.__version__, '1.12.0'): # https://github.com/ultralytics/yolov5/pull/8213 + if deterministic and check_version(torch.__version__, '1.12.0'): # https://github.com/ultralytics/yolov5/pull/8213 torch.use_deterministic_algorithms(True) os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' # os.environ['PYTHONHASHSEED'] = str(seed)