From 6d7ebd9cb66a1211c25014b65c0455bdca79b9a7 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Tue, 23 Nov 2021 12:41:57 +0000 Subject: [PATCH] GPU CI Improvements (#992) --- .azure-pipelines/gpu-example-tests.yml | 16 +++++ .../{gpu-tests.yml => gpu-special-tests.yml} | 24 +++---- .azure-pipelines/testing-template.yml | 69 +++++++++++++++++++ .gitignore | 1 + CHANGELOG.md | 4 ++ flash/core/finetuning.py | 2 - flash/core/model.py | 9 +++ flash/core/trainer.py | 3 + flash/image/classification/model.py | 2 +- flash/text/classification/data.py | 2 +- flash/text/classification/model.py | 2 +- flash/text/seq2seq/summarization/model.py | 2 +- flash/text/seq2seq/translation/model.py | 12 ++-- .../image_classification_imagenette_mini.py | 6 +- flash_examples/summarization.py | 2 +- flash_examples/text_classification.py | 2 +- flash_examples/translation.py | 2 +- tests/examples/test_integrations.py | 9 ++- tests/examples/test_scripts.py | 8 +-- tests/examples/utils.py | 2 +- 20 files changed, 138 insertions(+), 41 deletions(-) create mode 100644 .azure-pipelines/gpu-example-tests.yml rename .azure-pipelines/{gpu-tests.yml => gpu-special-tests.yml} (87%) create mode 100644 .azure-pipelines/testing-template.yml diff --git a/.azure-pipelines/gpu-example-tests.yml b/.azure-pipelines/gpu-example-tests.yml new file mode 100644 index 0000000000..58db4918d8 --- /dev/null +++ b/.azure-pipelines/gpu-example-tests.yml @@ -0,0 +1,16 @@ +trigger: + branches: + include: ["master"] +pr: + branches: + include: ["master"] + autoCancel: true + drafts: true + +jobs: +- template: testing-template.yml + parameters: + configs: + - "image" + - "text" + - "tabular" diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-special-tests.yml similarity index 87% rename from .azure-pipelines/gpu-tests.yml rename to .azure-pipelines/gpu-special-tests.yml index 3f8ba3652f..d18c237efe 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-special-tests.yml @@ -2,19 +2,19 @@ # Create and test a Python package on multiple Python versions. 
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: # https://docs.microsoft.com/azure/devops/pipelines/languages/python -trigger: none - -pr: none - -schedules: -- cron: "0 0 * * *" - displayName: Daily midnight testing +trigger: + tags: + include: + - '*' branches: include: - - master + - "master" + - "refs/tags/*" +pr: + - "master" jobs: - - job: pytest + - job: special # how long to run the job before automatically cancelling timeoutInMinutes: 45 # how much time to give 'run always even if cancelled tasks' before stopping them @@ -50,15 +50,11 @@ jobs: - bash: | # python -m pip install "pip==20.1" - pip install '.[all]' + pip install '.[image]' learn2learn pip install '.[test]' --upgrade-strategy only-if-needed pip list displayName: 'Install dependencies' - - bash: | - python -m coverage run --source flash -m pytest flash tests/examples/test_scripts.py -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=30 - displayName: 'Testing' - - bash: | bash tests/special_tests.sh displayName: 'Testing: special' diff --git a/.azure-pipelines/testing-template.yml b/.azure-pipelines/testing-template.yml new file mode 100644 index 0000000000..50e9f540cb --- /dev/null +++ b/.azure-pipelines/testing-template.yml @@ -0,0 +1,69 @@ +jobs: +- ${{ each config in parameters.configs }}: + - job: + displayName: ${{config}} + # how long to run the job before automatically cancelling + timeoutInMinutes: 45 + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: 2 + + pool: azure-gpus-spot + # this need to have installed docker in the base image... + container: + # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.8" + # image: "pytorch/pytorch:1.8.1-cuda11.0-cudnn8-runtime" + options: "-it --rm --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" + + workspace: + clean: all + steps: + + - bash: | + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + python --version + pip --version + pip list + df -kh /dev/shm + displayName: 'Image info & NVIDIA' + + - bash: | + python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'" + displayName: 'Sanity check' + + - bash: | + # python -m pip install "pip==20.1" + pip install '.[${{config}}]' + pip install '.[test]' --upgrade-strategy only-if-needed + pip list + displayName: 'Install dependencies' + + - bash: | + python -m coverage run --source flash -m pytest flash tests/examples/test_scripts.py -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=30 + displayName: 'Testing' + + - bash: | + python -m coverage report + python -m coverage xml + python -m coverage html + python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure + ls -l + displayName: 'Statistics' + + - task: PublishTestResults@2 + displayName: 'Publish test results' + inputs: + testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' + testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' + condition: succeededOrFailed() + + - task: PublishCodeCoverageResults@1 + displayName: 'Publish coverage report' + inputs: + codeCoverageTool: 'cobertura' + summaryFileLocation: 'coverage.xml' + reportDirectory: '$(Build.SourcesDirectory)/htmlcov' + testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - 
Python $(python.version)' + condition: succeededOrFailed() diff --git a/.gitignore b/.gitignore index ce993720d2..c4ebcfd822 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,4 @@ urban8k_images/ __MACOSX *-v2.0.json cifar-10* +mini-imagenet* diff --git a/CHANGELOG.md b/CHANGELOG.md index d26cdcd97c..45ebfa166e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a bug where Flash could not be used with IceVision 0.11.0 ([#989](https://github.com/PyTorchLightning/lightning-flash/pull/989)) +- Fixed a bug where backbone weights were sometimes not frozen correctly ([#992](https://github.com/PyTorchLightning/lightning-flash/pull/992)) + +- Fixed a bug where translation metrics were not computed correctly ([#992](https://github.com/PyTorchLightning/lightning-flash/pull/992)) + ### Removed - Removed `OutputMapping` ([#939](https://github.com/PyTorchLightning/lightning-flash/pull/939)) diff --git a/flash/core/finetuning.py b/flash/core/finetuning.py index 8e08e095ba..12bfe7907d 100644 --- a/flash/core/finetuning.py +++ b/flash/core/finetuning.py @@ -158,8 +158,6 @@ def finetune_function( self._freeze_unfreeze_function(pl_module, epoch, optimizer, opt_idx, self.strategy_metadata) elif self.strategy == FinetuningStrategies.UNFREEZE_MILESTONES: self._unfreeze_milestones_function(pl_module, epoch, optimizer, opt_idx, self.strategy_metadata) - else: - pass # Used for properly verifying input and providing neat and helpful error messages for users. diff --git a/flash/core/model.py b/flash/core/model.py index 5ed9b99a63..073c9baa1c 100644 --- a/flash/core/model.py +++ b/flash/core/model.py @@ -535,6 +535,15 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> A batch = torch.stack(batch) return self(batch) + def modules_to_freeze(self) -> Optional[Union[nn.Module]]: + """By default, we try to get the ``backbone`` attribute from the task and return it or ``None`` if not + present. + + Returns: + The backbone ``Module`` to freeze or ``None`` if this task does not have a ``backbone`` attribute. 
+ """ + return getattr(self, "backbone", None) + def _get_optimizer_class_from_registry(self, optimizer_key: str) -> Optimizer: if optimizer_key.lower() not in self.available_optimizers(): raise KeyError( diff --git a/flash/core/trainer.py b/flash/core/trainer.py index 3217a98b72..34cce32db1 100644 --- a/flash/core/trainer.py +++ b/flash/core/trainer.py @@ -85,6 +85,9 @@ def __init__(self, *args, serve_sanity_check: bool = False, **kwargs): kwargs["fast_dev_run"] = False else: kwargs["fast_dev_run"] = True + kwargs["gpus"] = None + kwargs["accelerator"] = None + kwargs["precision"] = 32 super().__init__(*args, **kwargs) self.serve_sanity_check = serve_sanity_check diff --git a/flash/image/classification/model.py b/flash/image/classification/model.py index 90e0181523..8fd6ec742b 100644 --- a/flash/image/classification/model.py +++ b/flash/image/classification/model.py @@ -152,6 +152,6 @@ def available_pretrained_weights(cls, backbone: str): def _ci_benchmark_fn(self, history: List[Dict[str, Any]]): """This function is used only for debugging usage with CI.""" if self.hparams.multi_label: - assert history[-1]["val_f1"] > 0.40, history[-1]["val_f1"] + assert history[-1]["val_f1"] > 0.30, history[-1]["val_f1"] else: assert history[-1]["val_accuracy"] > 0.85, history[-1]["val_accuracy"] diff --git a/flash/text/classification/data.py b/flash/text/classification/data.py index b0e4ca5841..6309d951c9 100644 --- a/flash/text/classification/data.py +++ b/flash/text/classification/data.py @@ -259,7 +259,7 @@ def __init__( val_transform: Optional[Dict[str, Callable]] = None, test_transform: Optional[Dict[str, Callable]] = None, predict_transform: Optional[Dict[str, Callable]] = None, - backbone: str = "prajjwal1/bert-tiny", + backbone: str = "prajjwal1/bert-medium", max_length: int = 128, ): self.backbone = backbone diff --git a/flash/text/classification/model.py b/flash/text/classification/model.py index 1e0b8f3abb..4a468295c4 100644 --- a/flash/text/classification/model.py +++ b/flash/text/classification/model.py @@ -57,7 +57,7 @@ class TextClassifier(ClassificationTask): def __init__( self, num_classes: int, - backbone: str = "prajjwal1/bert-tiny", + backbone: str = "prajjwal1/bert-medium", loss_fn: LOSS_FN_TYPE = None, optimizer: OPTIMIZER_TYPE = "Adam", lr_scheduler: LR_SCHEDULER_TYPE = None, diff --git a/flash/text/seq2seq/summarization/model.py b/flash/text/seq2seq/summarization/model.py index 6067eb5ceb..3b0147465f 100644 --- a/flash/text/seq2seq/summarization/model.py +++ b/flash/text/seq2seq/summarization/model.py @@ -89,4 +89,4 @@ def compute_metrics(self, generated_tokens: torch.Tensor, batch: Dict, prefix: s @staticmethod def _ci_benchmark_fn(history: List[Dict[str, Any]]): """This function is used only for debugging usage with CI.""" - assert history[-1]["rouge1_recall"] > 0.2 + assert history[-1]["rouge1_recall"] > 0.18, history[-1]["rouge1_recall"] diff --git a/flash/text/seq2seq/translation/model.py b/flash/text/seq2seq/translation/model.py index 553adb6b7a..d93e03ee04 100644 --- a/flash/text/seq2seq/translation/model.py +++ b/flash/text/seq2seq/translation/model.py @@ -81,13 +81,17 @@ def task(self) -> str: return "translation" def compute_metrics(self, generated_tokens, batch, prefix): - tgt_lns = self.tokenize_labels(batch["labels"]) + reference_corpus = self.tokenize_labels(batch["labels"]) # wrap targets in list as score expects a list of potential references - tgt_lns = [[reference] for reference in tgt_lns] - result = self.bleu(self._output_transform.uncollate(generated_tokens), 
tgt_lns) + reference_corpus = [[reference] for reference in reference_corpus] + + translate_corpus = self._output_transform.uncollate(generated_tokens) + translate_corpus = [line for line in translate_corpus] + + result = self.bleu(reference_corpus, translate_corpus) self.log(f"{prefix}_bleu_score", result, on_step=False, on_epoch=True, prog_bar=True) @staticmethod def _ci_benchmark_fn(history: List[Dict[str, Any]]): """This function is used only for debugging usage with CI.""" - assert history[-1]["val_bleu_score"] > 0.6 + assert history[-1]["val_bleu_score"] > 0.6, history[-1]["val_bleu_score"] diff --git a/flash_examples/integrations/learn2learn/image_classification_imagenette_mini.py b/flash_examples/integrations/learn2learn/image_classification_imagenette_mini.py index 00890af201..1459acca63 100644 --- a/flash_examples/integrations/learn2learn/image_classification_imagenette_mini.py +++ b/flash_examples/integrations/learn2learn/image_classification_imagenette_mini.py @@ -33,7 +33,6 @@ # download MiniImagenet train_dataset = l2l.vision.datasets.MiniImagenet(root="data", mode="train", download=True) val_dataset = l2l.vision.datasets.MiniImagenet(root="data", mode="validation", download=True) -test_dataset = l2l.vision.datasets.MiniImagenet(root="data", mode="test", download=True) train_transform = { "to_tensor_transform": nn.Sequential( @@ -69,9 +68,6 @@ train_targets=torch.from_numpy(train_dataset.y.astype(int)), val_data=val_dataset.x, val_targets=torch.from_numpy(val_dataset.y.astype(int)), - test_data=test_dataset.x, - test_targets=torch.from_numpy(test_dataset.y.astype(int)), - num_workers=4, train_transform=train_transform, ) @@ -90,7 +86,7 @@ "test_queries": 15, }, optimizer=torch.optim.Adam, - optimizer_kwargs={"lr": 0.001}, + learning_rate=0.001, ) trainer = flash.Trainer( diff --git a/flash_examples/summarization.py b/flash_examples/summarization.py index c032258fbd..5433805be3 100644 --- a/flash_examples/summarization.py +++ b/flash_examples/summarization.py @@ -30,7 +30,7 @@ # 3. Create the trainer and finetune the model trainer = Trainer(max_epochs=3) -trainer.finetune(model, datamodule=datamodule) +trainer.finetune(model, datamodule=datamodule, strategy="freeze") # 4. Summarize some text! predictions = model.predict( diff --git a/flash_examples/text_classification.py b/flash_examples/text_classification.py index bdeedbeb94..9e8a0b6856 100644 --- a/flash_examples/text_classification.py +++ b/flash_examples/text_classification.py @@ -25,7 +25,7 @@ "sentiment", train_file="data/imdb/train.csv", val_file="data/imdb/valid.csv", - backbone="prajjwal1/bert-tiny", + backbone="prajjwal1/bert-medium", ) # 2. Build the task diff --git a/flash_examples/translation.py b/flash_examples/translation.py index fc82bb767a..30f7c3053a 100644 --- a/flash_examples/translation.py +++ b/flash_examples/translation.py @@ -33,7 +33,7 @@ # 3. Create the trainer and finetune the model trainer = flash.Trainer(max_epochs=3, gpus=torch.cuda.device_count()) -trainer.finetune(model, datamodule=datamodule) +trainer.finetune(model, datamodule=datamodule, strategy="freeze") # 4. Translate something! 
predictions = model.predict( diff --git a/tests/examples/test_integrations.py b/tests/examples/test_integrations.py index 4923099df4..c7f66f8207 100644 --- a/tests/examples/test_integrations.py +++ b/tests/examples/test_integrations.py @@ -17,7 +17,7 @@ import pytest -from flash.core.utilities.imports import _BAAL_AVAILABLE, _FIFTYONE_AVAILABLE, _IMAGE_AVAILABLE +from flash.core.utilities.imports import _BAAL_AVAILABLE, _FIFTYONE_AVAILABLE, _IMAGE_AVAILABLE, _LEARN2LEARN_AVAILABLE from tests.examples.utils import run_test root = Path(__file__).parent.parent.parent @@ -39,6 +39,13 @@ "image_classification_active_learning.py", marks=pytest.mark.skipif(not (_IMAGE_AVAILABLE and _BAAL_AVAILABLE), reason="baal library isn't installed"), ), + pytest.param( + "learn2learn", + "image_classification_imagenette_mini.py", + marks=pytest.mark.skipif( + not (_IMAGE_AVAILABLE and _LEARN2LEARN_AVAILABLE), reason="learn2learn isn't installed" + ), + ), ], ) def test_integrations(tmpdir, folder, file): diff --git a/tests/examples/test_scripts.py b/tests/examples/test_scripts.py index 6593863fc9..033ee35b3d 100644 --- a/tests/examples/test_scripts.py +++ b/tests/examples/test_scripts.py @@ -17,7 +17,7 @@ import pytest -from flash.core.utilities.imports import _LEARN2LEARN_AVAILABLE, _SKLEARN_AVAILABLE +from flash.core.utilities.imports import _SKLEARN_AVAILABLE from tests.examples.utils import run_test from tests.helpers.utils import ( _AUDIO_TESTING, @@ -52,12 +52,6 @@ "image_classification_multi_label.py", marks=pytest.mark.skipif(not _IMAGE_TESTING, reason="image libraries aren't installed"), ), - pytest.param( - "image_classification_meta_learning.py.py", - marks=pytest.mark.skipif( - not (_IMAGE_TESTING and _LEARN2LEARN_AVAILABLE), reason="image/learn2learn libraries aren't installed" - ), - ), # pytest.param("finetuning", "object_detection.py"), # TODO: takes too long. pytest.param( "question_answering.py", diff --git a/tests/examples/utils.py b/tests/examples/utils.py index cf713fcbd1..6a8ef4dbb3 100644 --- a/tests/examples/utils.py +++ b/tests/examples/utils.py @@ -51,4 +51,4 @@ def run_test(filepath): code, stdout, stderr = call_script(filepath) print(f"{filepath} STDOUT: {stdout}") print(f"{filepath} STDERR: {stderr}") - assert not code + assert not code, code
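
---

Note (illustrative, not part of the patch): the new ``modules_to_freeze()`` hook added to ``flash/core/model.py`` is what a freeze-style finetuning strategy can rely on to locate the module whose weights should stop training, which is what the updated examples exercise by passing ``strategy="freeze"`` to ``trainer.finetune(...)``. The sketch below shows that contract under this assumption; ``DemoTask`` and ``freeze_modules`` are hypothetical names used only for illustration and are not part of the Flash API.

    from typing import Optional

    from torch import nn


    class DemoTask(nn.Module):
        """Hypothetical stand-in for a Flash ``Task`` with a backbone and a head."""

        def __init__(self) -> None:
            super().__init__()
            self.backbone = nn.Linear(8, 8)  # pretend pretrained feature extractor
            self.head = nn.Linear(8, 2)      # task-specific head that should keep training

        def modules_to_freeze(self) -> Optional[nn.Module]:
            # Mirrors the hook added in this patch: return the ``backbone``
            # attribute if the task has one, otherwise ``None``.
            return getattr(self, "backbone", None)


    def freeze_modules(task: DemoTask) -> None:
        """Freeze whatever the task reports via ``modules_to_freeze()``."""
        module = task.modules_to_freeze()
        if module is not None:
            for param in module.parameters():
                param.requires_grad = False


    task = DemoTask()
    freeze_modules(task)
    # Backbone parameters are frozen, head parameters still train.
    assert all(not p.requires_grad for p in task.backbone.parameters())
    assert all(p.requires_grad for p in task.head.parameters())

With this hook in place, the user-facing change in the examples is simply ``trainer.finetune(model, datamodule=datamodule, strategy="freeze")``, as shown in the updated ``summarization.py`` and ``translation.py``.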