Update warnings in TrainingTricksConnector (Lightning-AI#9595)
* update warnings

* add tests

* comments

* Apply suggestions from code review

* Apply suggestions from code review
rohitgr7 authored and speediedan committed Sep 28, 2021
1 parent ddf6967 commit c8749bf
Showing 22 changed files with 1,687 additions and 46 deletions.
1 change: 1 addition & 0 deletions pl_examples/__init__.py
@@ -14,6 +14,7 @@
_DATASETS_PATH = os.path.join(_PACKAGE_ROOT, "Datasets")

_DALI_AVAILABLE = _module_available("nvidia.dali")
_HF_AVAILABLE = _module_available("transformers") and _module_available("datasets")

LIGHTNING_LOGO = """
####
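The new `_HF_AVAILABLE` flag mirrors the existing `_DALI_AVAILABLE` check. A hedged usage sketch (not part of this diff) of how such a flag is typically consumed so that `pl_examples` still imports when the Hugging Face libraries are absent:

```python
# Hedged sketch: gate Hugging Face imports on the availability flag defined in
# pl_examples/__init__.py; HF-dependent examples can skip or raise clearly.
from pl_examples import _HF_AVAILABLE

if _HF_AVAILABLE:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
else:
    AutoModelForSequenceClassification = AutoTokenizer = None
```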
31 changes: 31 additions & 0 deletions pl_examples/basic_examples/fts_configs/RteBoolqModule_thaw_schedule_albert_base_v2.yaml
@@ -0,0 +1,31 @@
0:
- model.classifier.bias
- model.classifier.weight
1:
- model.albert.pooler.bias
- model.albert.pooler.weight
2:
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn_output.weight
3:
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.ffn.weight
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.LayerNorm.weight
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.dense.weight
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.value.weight
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.key.weight
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.attention.query.weight
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.bias
- model.albert.encoder.albert_layer_groups.0.albert_layers.0.full_layer_layer_norm.weight
- model.albert.encoder.embedding_hidden_mapping_in.bias
- model.albert.encoder.embedding_hidden_mapping_in.weight
- model.albert.embeddings.LayerNorm.bias
- model.albert.embeddings.LayerNorm.weight
- model.albert.embeddings.token_type_embeddings.weight
- model.albert.embeddings.position_embeddings.weight
- model.albert.embeddings.word_embeddings.weight
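This new YAML defines the explicit fine-tuning (thaw) schedule referenced by fts_explicit.yaml below: each top-level key is a phase index mapped to the parameter names that become trainable at that phase, starting from the classifier head and working back toward the embeddings. A minimal inspection sketch (the file path is taken from fts_explicit.yaml; PyYAML assumed available):

```python
import yaml

# Load the phase -> parameter-name mapping and summarize each phase.
with open(
    "pl_examples/basic_examples/fts_configs/RteBoolqModule_thaw_schedule_albert_base_v2.yaml"
) as f:
    thaw_schedule = yaml.safe_load(f)

for phase in sorted(thaw_schedule):
    params = thaw_schedule[phase]
    print(f"phase {phase}: {len(params)} parameters, e.g. {params[0]}")
```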
46 changes: 46 additions & 0 deletions pl_examples/basic_examples/fts_configs/fts_defaults.yaml
@@ -0,0 +1,46 @@
seed_everything: 42
# NEXT: create new pldev branch and run tests from pytorch_lightning branch
# consider using dataclasses for LM and LDM configuration since signature getting polluted
# potentially switch to a simpler test model in pl_examples or explore using boolq squad since might illuminate bugs
# move to main PL dev env and run from pl_examples!
debug_cfg:
lrs_test_mode: false
dev_debug: false
data:
class_path: pl_examples.basic_examples.fts_super_glue.RteBoolqDataModule
init_args:
model_name_or_path: albert-base-v2
task_name: rte
prep_on_init: false
num_workers: 0
pin_memory: false
tokenizers_parallelism: 'false'
max_seq_length: 128
train_batch_size: 32
eval_batch_size: 32
model:
class_path: pl_examples.basic_examples.fts_super_glue.RteBoolqModule
init_args:
optimizer_init:
class_path: torch.optim.AdamW
init_args:
weight_decay: 1.0e-05
eps: 1.0e-07
lr: 1.0e-05
lr_scheduler_init:
class_path: torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
init_args:
T_0: 1
T_mult: 2
eta_min: 0
pl_lrs_cfg:
interval: epoch
frequency: 1
name: CosineAnnealingWithWarmRestartsLR
trainer:
plugins: ddp_find_unused_parameters_false # use registered version of DDP with find_unused_parameters set to false
max_epochs: 100
gpus: 2
accelerator: ddp
log_gpu_memory: all
precision: 16
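fts_defaults.yaml carries the shared seed, datamodule, model/optimizer, and trainer settings, and the per-experiment files below layer on top of it. A hedged sketch of the entry point this layout suggests (it assumes fts_super_glue.py follows the standard LightningCLI pattern and that configs are composed with repeated `--config` flags, e.g. `--config fts_defaults.yaml --config fts_explicit.yaml`):

```python
# Hedged sketch of a LightningCLI entry point for these configs; the actual
# fts_super_glue.py may differ.
from pytorch_lightning.utilities.cli import LightningCLI

from pl_examples.basic_examples.fts_super_glue import RteBoolqDataModule, RteBoolqModule

if __name__ == "__main__":
    # class_path/init_args entries in the YAML map onto these classes' __init__ arguments.
    LightningCLI(RteBoolqModule, RteBoolqDataModule)
```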
28 changes: 28 additions & 0 deletions pl_examples/basic_examples/fts_configs/fts_explicit.yaml
@@ -0,0 +1,28 @@
trainer:
#resume_from_checkpoint: /home/speediedan/repos/pytorch-lightning/lightning_logs/fts_explicit/version_14/checkpoints/epoch=32-step=1286.ckpt
#resume_from_checkpoint: /home/speediedan/repos/pytorch-lightning/lightning_logs/fts_explicit/version_24/checkpoints/epoch=17-step=701.ckpt
callbacks:
- class_path: pytorch_lightning.callbacks.finetuning_scheduler.FinetuningScheduler
init_args:
thaw_schedule: ./pl_examples/basic_examples/fts_configs/RteBoolqModule_thaw_schedule_albert_base_v2.yaml
base_max_lr: 1.0e-05
dump_model_thaw_sched_only: false
max_depth: null
- class_path: pytorch_lightning.callbacks.finetuning_scheduler.FTSModelCheckpoint
init_args:
save_top_k: 5
monitor: val_loss
verbose: true
- class_path: pytorch_lightning.callbacks.EarlyStopping
init_args:
monitor: val_loss
min_delta: 0.001 # big delta for now to test instead of 0.001
patience: 2 # limited patience for testing
verbose: false
mode: min
- class_path: pytorch_lightning.callbacks.finetuning_scheduler.FTSLearningRateMonitor
logger:
class_path: pytorch_lightning.loggers.TensorBoardLogger
init_args:
save_dir: lightning_logs
name: fts_explicit
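For readers who prefer the Trainer API, a hedged programmatic equivalent of the fts_explicit.yaml callback stack (the finetuning_scheduler callbacks are introduced by this fork; all argument values are taken from the YAML above and from fts_defaults.yaml):

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks.finetuning_scheduler import (
    FinetuningScheduler,
    FTSLearningRateMonitor,
    FTSModelCheckpoint,
)

callbacks = [
    FinetuningScheduler(
        thaw_schedule="./pl_examples/basic_examples/fts_configs/RteBoolqModule_thaw_schedule_albert_base_v2.yaml",
        base_max_lr=1e-05,
        max_depth=None,
    ),
    FTSModelCheckpoint(save_top_k=5, monitor="val_loss", verbose=True),
    EarlyStopping(monitor="val_loss", min_delta=0.001, patience=2, mode="min"),
    FTSLearningRateMonitor(),
]
# Trainer flags mirror fts_defaults.yaml (PL 1.4-era arguments).
trainer = Trainer(callbacks=callbacks, max_epochs=100, gpus=2, accelerator="ddp", precision=16)
```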
27 changes: 27 additions & 0 deletions pl_examples/basic_examples/fts_configs/fts_implicit.yaml
@@ -0,0 +1,27 @@
trainer:
#resume_from_checkpoint: /home/speediedan/repos/pytorch-lightning/lightning_logs/fts_implicit/version_16/checkpoints/epoch=36-step=1442.ckpt
callbacks:
- class_path: pytorch_lightning.callbacks.finetuning_scheduler.FinetuningScheduler
init_args:
base_max_lr: 1.0e-05
dump_model_thaw_sched_only: false
#restore_best: false
max_depth: null
- class_path: pytorch_lightning.callbacks.finetuning_scheduler.FTSModelCheckpoint
init_args:
save_top_k: 5
monitor: val_loss
verbose: true
- class_path: pytorch_lightning.callbacks.EarlyStopping
init_args:
monitor: val_loss
min_delta: 0.001
patience: 2
verbose: false
mode: min
- class_path: pytorch_lightning.callbacks.LearningRateMonitor
logger:
class_path: pytorch_lightning.loggers.TensorBoardLogger
init_args:
save_dir: lightning_logs
name: fts_implicit
15 changes: 15 additions & 0 deletions pl_examples/basic_examples/fts_configs/nofts_baseline.yaml
@@ -0,0 +1,15 @@
trainer:
#resume_from_checkpoint: /home/speediedan/repos/pytorch-lightning/lightning_logs/nofts_baseline/version_3/checkpoints/epoch=4-step=194.ckpt
callbacks:
- class_path: pytorch_lightning.callbacks.EarlyStopping
init_args:
monitor: val_loss
min_delta: 0.001
patience: 2
verbose: false
mode: min
logger:
class_path: pytorch_lightning.loggers.TensorBoardLogger
init_args:
save_dir: lightning_logs
name: nofts_baseline
18 changes: 18 additions & 0 deletions pl_examples/basic_examples/fts_configs/nofts_milestone_ft.yaml
@@ -0,0 +1,18 @@
# TODO: add bug regarding appending of callbacks instead of overriding?
trainer:
callbacks:
- class_path: pytorch_lightning.callbacks.EarlyStopping
init_args:
monitor: val_loss
min_delta: 0.001
patience: 2
verbose: false
mode: min
- class_path: pytorch_lightning.callbacks.finetuning_scheduler.MilestonesFinetuning
init_args:
milestones: [2, 4]
logger:
class_path: pytorch_lightning.loggers.TensorBoardLogger
init_args:
save_dir: lightning_logs
name: nofts_milestone_ft