Merge pull request #453 from colibrisson/autocast

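Add a --precision option to ketos train and ketos pretrain, and extend the existing --precision option of ketos segtrain (new choices 64 and bf16, default changed from 32 to 16, value now passed to the trainer)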
mittagessen · Feb 23, 2023 · 50d7860
2 parents d6794a7 + 3dccaa0
commit 50d7860
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 5 deletions.
diff --git a/environment.yml b/environment.yml
@@ -21,7 +21,7 @@ dependencies:
   - pyvips
   - imagemagick>=7.1.0
   - pyarrow
-  - conda-forge::pytorch-lightning>=1.7.0
+  - conda-forge::pytorch-lightning>=1.9.0,<2.0
   - conda-forge::torchmetrics>=0.10.0
   - pip
   - albumentations

diff --git a/environment_cuda.yml b/environment_cuda.yml
@@ -22,7 +22,7 @@ dependencies:
   - pyvips
   - imagemagick>=7.1.0
   - pyarrow
-  - conda-forge::pytorch-lightning>=1.7.0
+  - conda-forge::pytorch-lightning>=1.9.0,<2.0
   - conda-forge::torchmetrics>=0.10.0
   - pip
   - albumentations

diff --git a/kraken/ketos/pretrain.py b/kraken/ketos/pretrain.py
@@ -77,6 +77,11 @@
               type=click.FLOAT,
               help='Minimum improvement between epochs to reset early stopping. Default is scales the delta by the best loss')
 @click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)')
+@click.option('--precision',
+                show_default=True, 
+                default='16', 
+                type=click.Choice(['64', '32', 'bf16', '16']),
+                help='Numerical precision to use for training. Default is 16-bit mixed precision.')
 @click.option('--optimizer',
               show_default=True,
               default=RECOGNITION_PRETRAIN_HYPER_PARAMS['optimizer'],
@@ -176,7 +181,7 @@
               help='Multiplicative factor for the logits used in contrastive loss.')
 @click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
 def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
-             min_epochs, lag, min_delta, device, optimizer, lrate, momentum,
+             min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
              weight_decay, warmup, schedule, gamma, step_size, sched_patience,
              cos_max, partition, fixed_splits, training_files,
              evaluation_files, workers, load_hyper_parameters, repolygonize,
@@ -271,6 +276,7 @@ def pretrain(ctx, batch_size, pad, output, spec, load, freq, quit, epochs,
 
     trainer = KrakenTrainer(accelerator=accelerator,
                             devices=device,
+                            precision=precision,
                             max_epochs=hyper_params['epochs'] if hyper_params['quit'] == 'dumb' else -1,
                             min_epochs=hyper_params['min_epochs'],
                             enable_progress_bar=True if not ctx.meta['verbose'] else False,

diff --git a/kraken/ketos/recognition.py b/kraken/ketos/recognition.py
@@ -76,6 +76,11 @@
               type=click.FLOAT,
               help='Minimum improvement between epochs to reset early stopping. Default is scales the delta by the best loss')
 @click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)')
+@click.option('--precision',
+                show_default=True, 
+                default='16', 
+                type=click.Choice(['64', '32', 'bf16', '16']),
+                help='Numerical precision to use for training. Default is 16-bit mixed precision.')
 @click.option('--optimizer',
               show_default=True,
               default=RECOGNITION_HYPER_PARAMS['optimizer'],
@@ -181,7 +186,7 @@
               help='Path to directory where the logger will store the logs. If not set, a directory will be created in the current working directory.')
 @click.argument('ground_truth', nargs=-1, callback=_expand_gt, type=click.Path(exists=False, dir_okay=False))
 def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
-          min_epochs, lag, min_delta, device, optimizer, lrate, momentum,
+          min_epochs, lag, min_delta, device, precision, optimizer, lrate, momentum,
           weight_decay, warmup, freeze_backbone, schedule, gamma, step_size,
           sched_patience, cos_max, partition, fixed_splits, normalization,
           normalize_whitespace, codec, resize, reorder, base_dir,
@@ -293,6 +298,7 @@ def train(ctx, batch_size, pad, output, spec, append, load, freq, quit, epochs,
 
     trainer = KrakenTrainer(accelerator=accelerator,
                             devices=device,
+                            precision=precision,
                             max_epochs=hyper_params['epochs'] if hyper_params['quit'] == 'dumb' else -1,
                             min_epochs=hyper_params['min_epochs'],
                             freeze_backbone=hyper_params['freeze_backbone'],

diff --git a/kraken/ketos/segmentation.py b/kraken/ketos/segmentation.py
@@ -98,7 +98,11 @@ def _validate_merging(ctx, param, value):
               type=click.FLOAT,
               help='Minimum improvement between epochs to reset early stopping. By default it scales the delta by the best loss')
 @click.option('-d', '--device', show_default=True, default='cpu', help='Select device to use (cpu, cuda:0, cuda:1, ...)')
-@click.option('--precision', default='32', type=click.Choice(['32', '16']), help='set tensor precision')
+@click.option('--precision',
+                show_default=True, 
+                default='16',
+                type=click.Choice(['64', '32', 'bf16', '16']),
+                help='Numerical precision to use for training. Default is 16-bit mixed precision.')
 @click.option('--optimizer',
               show_default=True,
               default=SEGMENTATION_HYPER_PARAMS['optimizer'],
@@ -332,6 +336,7 @@ def segtrain(ctx, output, spec, line_width, pad, load, freq, quit, epochs,
 
     trainer = KrakenTrainer(accelerator=accelerator,
                             devices=device,
+                            precision=precision,
                             max_epochs=hyper_params['epochs'] if hyper_params['quit'] == 'dumb' else -1,
                             min_epochs=hyper_params['min_epochs'],
                             enable_progress_bar=True if not ctx.meta['verbose'] else False,