diff --git a/environment.yml b/environment.yml index 3e7434685..d0b9a86c4 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - pyvips - imagemagick>=7.1.0 - pyarrow - - conda-forge::pytorch-lightning>=1.7.0 + - conda-forge::pytorch-lightning>=1.9.0,<2.0 - conda-forge::torchmetrics>=0.10.0 - pip - albumentations diff --git a/environment_cuda.yml b/environment_cuda.yml index fd719f72c..b28c9dcda 100644 --- a/environment_cuda.yml +++ b/environment_cuda.yml @@ -22,7 +22,7 @@ dependencies: - pyvips - imagemagick>=7.1.0 - pyarrow - - conda-forge::pytorch-lightning>=1.7.0 + - conda-forge::pytorch-lightning>=1.9.0,<2.0 - conda-forge::torchmetrics>=0.10.0 - pip - albumentations diff --git a/kraken/ketos/pretrain.py b/kraken/ketos/pretrain.py index 4fcae7207..679b769a2 100644 --- a/kraken/ketos/pretrain.py +++ b/kraken/ketos/pretrain.py @@ -77,6 +77,11 @@ type=click.FLOAT, help='Minimum improvement between epochs to reset early stopping. Default is scales the delta by the best loss') @click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)') +@click.option('--precision', + show_default=True, + default='16', + type=click.Choice(['64', '32', 'bf16', '16']), + help='Numerical precision to use for training. Default is 16-bit mixed precision.') @click.option('--optimizer', show_default=True, default=RECOGNITION_PRETRAIN_HYPER_PARAMS['optimizer'], @@ -176,7 +181,7 @@ help='Multiplicative factor for the logits used in contrastive loss.') @click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False)) def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs, - min_epochs, lag, min_delta, device, optimizer, lrate, momentum, + min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum, weight_decay, warmup, schedule, gamma, step_size, sched_patience, cos_max, partition, fixed_splits, training_files, evaluation_files, workers, load_hyper_parameters, repolygonize, @@ -271,6 +276,7 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs, trainer = KrakenTrainer(accelerator=accelerator, devices=device, + precision=precision, max_epochs=hyper_params['epochs'] if hyper_params['quit'] == 'dumb' else -1, min_epochs=hyper_params['min_epochs'], enable_progress_bar=True if not ctx.meta['verbose'] else False, diff --git a/kraken/ketos/recognition.py b/kraken/ketos/recognition.py index 7cfae5ca7..b1dc4f8d3 100644 --- a/kraken/ketos/recognition.py +++ b/kraken/ketos/recognition.py @@ -76,6 +76,11 @@ type=click.FLOAT, help='Minimum improvement between epochs to reset early stopping. Default is scales the delta by the best loss') @click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)') +@click.option('--precision', + show_default=True, + default='16', + type=click.Choice(['64', '32', 'bf16', '16']), + help='Numerical precision to use for training. Default is 16-bit mixed precision.') @click.option('--optimizer', show_default=True, default=RECOGNITION_HYPER_PARAMS['optimizer'], @@ -181,7 +186,7 @@ help='Path to directory where the logger will store the logs. If not set, a directory will be created in the current working directory.') @click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False)) def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs, - min_epochs, lag, min_delta, device, optimizer, lrate, momentum, + min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum, weight_decay, warmup, freeze_backbone, schedule, gamma, step_size, sched_patience, cos_max, partition, fixed_splits, normalization, normalize_whitespace, codec, resize, reorder, base_dir, @@ -293,6 +298,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs, trainer = KrakenTrainer(accelerator=accelerator, devices=device, + precision=precision, max_epochs=hyper_params['epochs'] if hyper_params['quit'] == 'dumb' else -1, min_epochs=hyper_params['min_epochs'], freeze_backbone=hyper_params['freeze_backbone'], diff --git a/kraken/ketos/segmentation.py b/kraken/ketos/segmentation.py index 6b5f9f259..f01673e49 100644 --- a/kraken/ketos/segmentation.py +++ b/kraken/ketos/segmentation.py @@ -98,7 +98,11 @@ def _validate_merging(ctx, param, value): type=click.FLOAT, help='Minimum improvement between epochs to reset early stopping. By default it scales the delta by the best loss') @click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)') -@click.option('--precision', default='32', type=click.Choice(['32', '16']), help='set tensor precision') +@click.option('--precision', + show_default=True, + default='16', + type=click.Choice(['64', '32', 'bf16', '16']), + help='Numerical precision to use for training. Default is 16-bit mixed precision.') @click.option('--optimizer', show_default=True, default=SEGMENTATION_HYPER_PARAMS['optimizer'], @@ -332,6 +336,7 @@ def segtrain(ctx, output, spec, line_width, pad, load, freq, quit, epochs, trainer = KrakenTrainer(accelerator=accelerator, devices=device, + precision=precision, max_epochs=hyper_params['epochs'] if hyper_params['quit'] == 'dumb' else -1, min_epochs=hyper_params['min_epochs'], enable_progress_bar=True if not ctx.meta['verbose'] else False,