Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Commit

Permalink
GPU CI Improvements (#992)
Browse files Browse the repository at this point in the history
  • Loading branch information
ethanwharris authored Nov 23, 2021
1 parent 1489f47 commit 6d7ebd9
Show file tree
Hide file tree
Showing 20 changed files with 138 additions and 41 deletions.
16 changes: 16 additions & 0 deletions .azure-pipelines/gpu-example-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# GPU example-test pipeline: fans the shared job template out over the
# example domains listed under `configs`.

# CI trigger: pushes to master.
trigger:
  branches:
    include:
      - "master"

# PR trigger: pull requests targeting master. `autoCancel` and `drafts` are
# options of the `pr` trigger, so they must be nested under it.
pr:
  branches:
    include:
      - "master"
  autoCancel: true
  drafts: true

jobs:
  # testing-template.yml loops over `configs` and emits one job per entry.
  - template: testing-template.yml
    parameters:
      configs:
        - "image"
        - "text"
        - "tabular"
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,19 @@
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python
trigger: none

pr: none

schedules:
- cron: "0 0 * * *"
displayName: Daily midnight testing
trigger:
tags:
include:
- '*'
branches:
include:
- master
- "master"
- "refs/tags/*"
pr:
- "master"

jobs:
- job: pytest
- job: special
# how long to run the job before automatically cancelling
timeoutInMinutes: 45
# how much time to give 'run always even if cancelled tasks' before stopping them
Expand Down Expand Up @@ -50,15 +50,11 @@ jobs:
- bash: |
# python -m pip install "pip==20.1"
pip install '.[all]'
pip install '.[image]' learn2learn
pip install '.[test]' --upgrade-strategy only-if-needed
pip list
displayName: 'Install dependencies'
- bash: |
python -m coverage run --source flash -m pytest flash tests/examples/test_scripts.py -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=30
displayName: 'Testing'
- bash: |
bash tests/special_tests.sh
displayName: 'Testing: special'
Expand Down
69 changes: 69 additions & 0 deletions .azure-pipelines/testing-template.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Reusable job template: emits one GPU test job per entry in `configs`.
# Each entry is used as the job's display name and as the pip extra that is
# installed for that job (`pip install '.[<config>]'`).
parameters:
  # List of extras to test; declared explicitly so the template's contract is
  # visible and a missing argument falls back to an empty list.
  - name: configs
    type: object
    default: []

jobs:
  - ${{ each config in parameters.configs }}:
      - job:
        displayName: ${{ config }}
        # how long to run the job before automatically cancelling
        timeoutInMinutes: 45
        # how much time to give 'run always even if cancelled tasks' before stopping them
        cancelTimeoutInMinutes: 2

        pool: azure-gpus-spot
        # this needs docker to be installed in the base image...
        container:
          # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.8"
          # image: "pytorch/pytorch:1.8.1-cuda11.0-cudnn8-runtime"
          options: "-it --rm --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"

        workspace:
          clean: all

        steps:
          # Dump hardware / interpreter / package details for debugging.
          - bash: |
              lspci | grep -E 'VGA|3D'
              whereis nvidia
              nvidia-smi
              python --version
              pip --version
              pip list
              df -kh /dev/shm
            displayName: 'Image info & NVIDIA'

          # Fail early unless at least two GPUs are visible to torch.
          - bash: |
              python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'GPU: {mgpu}'"
            displayName: 'Sanity check'

          # Install the extra under test first, then the test requirements.
          - bash: |
              # python -m pip install "pip==20.1"
              pip install '.[${{config}}]'
              pip install '.[test]' --upgrade-strategy only-if-needed
              pip list
            displayName: 'Install dependencies'

          # Run the suite under coverage and write a JUnit report for publishing.
          - bash: |
              python -m coverage run --source flash -m pytest flash tests/examples/test_scripts.py -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=30
            displayName: 'Testing'

          # Produce text/XML/HTML coverage reports and upload to Codecov.
          - bash: |
              python -m coverage report
              python -m coverage xml
              python -m coverage html
              python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
              ls -l
            displayName: 'Statistics'

          - task: PublishTestResults@2
            displayName: 'Publish test results'
            inputs:
              testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
              # NOTE(review): `python.version` is not defined in this template —
              # confirm it is set elsewhere, or the macro may appear unexpanded.
              testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
            # Publish even when the test step failed.
            condition: succeededOrFailed()

          - task: PublishCodeCoverageResults@1
            displayName: 'Publish coverage report'
            inputs:
              codeCoverageTool: 'cobertura'
              summaryFileLocation: 'coverage.xml'
              reportDirectory: '$(Build.SourcesDirectory)/htmlcov'
              # NOTE(review): `testRunTitle` does not appear to be a documented
              # input of PublishCodeCoverageResults@1 — confirm and drop if unused.
              testRunTitle: '$(Agent.OS) - $(Build.BuildNumber)[$(Agent.JobName)] - Python $(python.version)'
            # Publish even when an earlier step failed.
            condition: succeededOrFailed()
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -169,3 +169,4 @@ urban8k_images/
__MACOSX
*-v2.0.json
cifar-10*
mini-imagenet*
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Fixed a bug where Flash could not be used with IceVision 0.11.0 ([#989](https://github.com/PyTorchLightning/lightning-flash/pull/989))

- Fixed a bug where backbone weights were sometimes not frozen correctly ([#992](https://github.com/PyTorchLightning/lightning-flash/pull/992))

- Fixed a bug where translation metrics were not computed correctly ([#992](https://github.com/PyTorchLightning/lightning-flash/pull/992))

### Removed

- Removed `OutputMapping` ([#939](https://github.com/PyTorchLightning/lightning-flash/pull/939))
Expand Down
2 changes: 0 additions & 2 deletions flash/core/finetuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,6 @@ def finetune_function(
self._freeze_unfreeze_function(pl_module, epoch, optimizer, opt_idx, self.strategy_metadata)
elif self.strategy == FinetuningStrategies.UNFREEZE_MILESTONES:
self._unfreeze_milestones_function(pl_module, epoch, optimizer, opt_idx, self.strategy_metadata)
else:
pass


# Used for properly verifying input and providing neat and helpful error messages for users.
Expand Down
9 changes: 9 additions & 0 deletions flash/core/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,15 @@ def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> A
batch = torch.stack(batch)
return self(batch)

def modules_to_freeze(self) -> Optional[nn.Module]:
    """Return the module that should be frozen for this task, if any.

    By default, we try to get the ``backbone`` attribute from the task and return it or ``None`` if not
    present.

    Returns:
        The backbone ``Module`` to freeze or ``None`` if this task does not have a ``backbone`` attribute.
    """
    # A three-argument getattr avoids an AttributeError on tasks that never define a backbone.
    return getattr(self, "backbone", None)

def _get_optimizer_class_from_registry(self, optimizer_key: str) -> Optimizer:
if optimizer_key.lower() not in self.available_optimizers():
raise KeyError(
Expand Down
3 changes: 3 additions & 0 deletions flash/core/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def __init__(self, *args, serve_sanity_check: bool = False, **kwargs):
kwargs["fast_dev_run"] = False
else:
kwargs["fast_dev_run"] = True
kwargs["gpus"] = None
kwargs["accelerator"] = None
kwargs["precision"] = 32
super().__init__(*args, **kwargs)

self.serve_sanity_check = serve_sanity_check
Expand Down
2 changes: 1 addition & 1 deletion flash/image/classification/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,6 @@ def available_pretrained_weights(cls, backbone: str):
def _ci_benchmark_fn(self, history: List[Dict[str, Any]]):
"""This function is used only for debugging usage with CI."""
if self.hparams.multi_label:
assert history[-1]["val_f1"] > 0.40, history[-1]["val_f1"]
assert history[-1]["val_f1"] > 0.30, history[-1]["val_f1"]
else:
assert history[-1]["val_accuracy"] > 0.85, history[-1]["val_accuracy"]
2 changes: 1 addition & 1 deletion flash/text/classification/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def __init__(
val_transform: Optional[Dict[str, Callable]] = None,
test_transform: Optional[Dict[str, Callable]] = None,
predict_transform: Optional[Dict[str, Callable]] = None,
backbone: str = "prajjwal1/bert-tiny",
backbone: str = "prajjwal1/bert-medium",
max_length: int = 128,
):
self.backbone = backbone
Expand Down
2 changes: 1 addition & 1 deletion flash/text/classification/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class TextClassifier(ClassificationTask):
def __init__(
self,
num_classes: int,
backbone: str = "prajjwal1/bert-tiny",
backbone: str = "prajjwal1/bert-medium",
loss_fn: LOSS_FN_TYPE = None,
optimizer: OPTIMIZER_TYPE = "Adam",
lr_scheduler: LR_SCHEDULER_TYPE = None,
Expand Down
2 changes: 1 addition & 1 deletion flash/text/seq2seq/summarization/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,4 @@ def compute_metrics(self, generated_tokens: torch.Tensor, batch: Dict, prefix: s
@staticmethod
def _ci_benchmark_fn(history: List[Dict[str, Any]]):
"""This function is used only for debugging usage with CI."""
assert history[-1]["rouge1_recall"] > 0.2
assert history[-1]["rouge1_recall"] > 0.18, history[-1]["rouge1_recall"]
12 changes: 8 additions & 4 deletions flash/text/seq2seq/translation/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,17 @@ def task(self) -> str:
return "translation"

def compute_metrics(self, generated_tokens, batch, prefix):
tgt_lns = self.tokenize_labels(batch["labels"])
reference_corpus = self.tokenize_labels(batch["labels"])
# wrap targets in list as score expects a list of potential references
tgt_lns = [[reference] for reference in tgt_lns]
result = self.bleu(self._output_transform.uncollate(generated_tokens), tgt_lns)
reference_corpus = [[reference] for reference in reference_corpus]

translate_corpus = self._output_transform.uncollate(generated_tokens)
translate_corpus = [line for line in translate_corpus]

result = self.bleu(reference_corpus, translate_corpus)
self.log(f"{prefix}_bleu_score", result, on_step=False, on_epoch=True, prog_bar=True)

@staticmethod
def _ci_benchmark_fn(history: List[Dict[str, Any]]):
"""This function is used only for debugging usage with CI."""
assert history[-1]["val_bleu_score"] > 0.6
assert history[-1]["val_bleu_score"] > 0.6, history[-1]["val_bleu_score"]
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
# download MiniImagenet
train_dataset = l2l.vision.datasets.MiniImagenet(root="data", mode="train", download=True)
val_dataset = l2l.vision.datasets.MiniImagenet(root="data", mode="validation", download=True)
test_dataset = l2l.vision.datasets.MiniImagenet(root="data", mode="test", download=True)

train_transform = {
"to_tensor_transform": nn.Sequential(
Expand Down Expand Up @@ -69,9 +68,6 @@
train_targets=torch.from_numpy(train_dataset.y.astype(int)),
val_data=val_dataset.x,
val_targets=torch.from_numpy(val_dataset.y.astype(int)),
test_data=test_dataset.x,
test_targets=torch.from_numpy(test_dataset.y.astype(int)),
num_workers=4,
train_transform=train_transform,
)

Expand All @@ -90,7 +86,7 @@
"test_queries": 15,
},
optimizer=torch.optim.Adam,
optimizer_kwargs={"lr": 0.001},
learning_rate=0.001,
)

trainer = flash.Trainer(
Expand Down
2 changes: 1 addition & 1 deletion flash_examples/summarization.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

# 3. Create the trainer and finetune the model
trainer = Trainer(max_epochs=3)
trainer.finetune(model, datamodule=datamodule)
trainer.finetune(model, datamodule=datamodule, strategy="freeze")

# 4. Summarize some text!
predictions = model.predict(
Expand Down
2 changes: 1 addition & 1 deletion flash_examples/text_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"sentiment",
train_file="data/imdb/train.csv",
val_file="data/imdb/valid.csv",
backbone="prajjwal1/bert-tiny",
backbone="prajjwal1/bert-medium",
)

# 2. Build the task
Expand Down
2 changes: 1 addition & 1 deletion flash_examples/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

# 3. Create the trainer and finetune the model
trainer = flash.Trainer(max_epochs=3, gpus=torch.cuda.device_count())
trainer.finetune(model, datamodule=datamodule)
trainer.finetune(model, datamodule=datamodule, strategy="freeze")

# 4. Translate something!
predictions = model.predict(
Expand Down
9 changes: 8 additions & 1 deletion tests/examples/test_integrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import pytest

from flash.core.utilities.imports import _BAAL_AVAILABLE, _FIFTYONE_AVAILABLE, _IMAGE_AVAILABLE
from flash.core.utilities.imports import _BAAL_AVAILABLE, _FIFTYONE_AVAILABLE, _IMAGE_AVAILABLE, _LEARN2LEARN_AVAILABLE
from tests.examples.utils import run_test

root = Path(__file__).parent.parent.parent
Expand All @@ -39,6 +39,13 @@
"image_classification_active_learning.py",
marks=pytest.mark.skipif(not (_IMAGE_AVAILABLE and _BAAL_AVAILABLE), reason="baal library isn't installed"),
),
pytest.param(
"learn2learn",
"image_classification_imagenette_mini.py",
marks=pytest.mark.skipif(
not (_IMAGE_AVAILABLE and _LEARN2LEARN_AVAILABLE), reason="learn2learn isn't installed"
),
),
],
)
def test_integrations(tmpdir, folder, file):
Expand Down
8 changes: 1 addition & 7 deletions tests/examples/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import pytest

from flash.core.utilities.imports import _LEARN2LEARN_AVAILABLE, _SKLEARN_AVAILABLE
from flash.core.utilities.imports import _SKLEARN_AVAILABLE
from tests.examples.utils import run_test
from tests.helpers.utils import (
_AUDIO_TESTING,
Expand Down Expand Up @@ -52,12 +52,6 @@
"image_classification_multi_label.py",
marks=pytest.mark.skipif(not _IMAGE_TESTING, reason="image libraries aren't installed"),
),
pytest.param(
"image_classification_meta_learning.py.py",
marks=pytest.mark.skipif(
not (_IMAGE_TESTING and _LEARN2LEARN_AVAILABLE), reason="image/learn2learn libraries aren't installed"
),
),
# pytest.param("finetuning", "object_detection.py"), # TODO: takes too long.
pytest.param(
"question_answering.py",
Expand Down
2 changes: 1 addition & 1 deletion tests/examples/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ def run_test(filepath):
code, stdout, stderr = call_script(filepath)
print(f"{filepath} STDOUT: {stdout}")
print(f"{filepath} STDERR: {stderr}")
assert not code
assert not code, code

0 comments on commit 6d7ebd9

Please sign in to comment.