Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci/gpu: debug skipped cache #2709

Merged
merged 25 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .azure/gpu-nuke-cache.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Pipeline that wipes the persistent cache locations under /var/tmp on the GPU agent pool.
# It runs on every tag, once a month on master, and on PRs that modify this file.
trigger:
  tags:
    include:
      - "*"
# run every month to sanitize the dev environment
schedules:
  # 00:00 UTC on the first day of each month
  - cron: "0 0 1 * *"
    displayName: Monthly nuke caches
    branches:
      include:
        - master
    # run even if master had no new commits since the last successful scheduled run,
    # otherwise the scheduled cleanup would be silently skipped (default is `false`)
    always: true
# run on PR changing only this file
pr:
  branches:
    include:
      - master
  paths:
    include:
      - .azure/gpu-nuke-cache.yml

jobs:
  - job: nuke_caches
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "10"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"

    pool: "lit-rtx-3090"

    variables:
      # these cache locations assume the jobs run repeatedly on the same set of machines
      # see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481
      TORCH_HOME: "/var/tmp/torch"
      TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
      HF_HOME: "/var/tmp/hf/home"
      HF_HUB_CACHE: "/var/tmp/hf/hub"
      PIP_CACHE_DIR: "/var/tmp/pip"
      CACHED_REFERENCES: "/var/tmp/cached-references.zip"

    container:
      image: "ubuntu:22.04"
      # bind-mount the agent's /var/tmp so the deletions below affect the host-side caches
      options: "-v /var/tmp:/var/tmp"

    steps:
      # remove every cache location declared in `variables`; `set -ex` echoes each command
      # and aborts the step on the first failure
      - bash: |
          set -ex
          rm -rf $(TORCH_HOME)
          rm -rf $(TRANSFORMERS_CACHE)
          rm -rf $(HF_HOME)
          rm -rf $(HF_HUB_CACHE)
          rm -rf $(PIP_CACHE_DIR)
          rm -rf $(CACHED_REFERENCES)
        displayName: "delete all caches"
      # list what is left in /var/tmp so the cleanup result is visible in the job log
      - bash: |
          ls -lh /var/tmp
        displayName: "show tmp/ folder"
47 changes: 31 additions & 16 deletions .azure/gpu-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ trigger:
- master
- release/*
- refs/tags/*
# run every month to populate caches
schedules:
- cron: "0 1 1 * *"
displayName: Monthly re-build caches
branches:
include:
- master
pr:
- master
- release/*
Expand Down Expand Up @@ -67,6 +74,11 @@ jobs:
CUDA_version_mm="${CUDA_version//'.'/''}"
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html"
mkdir -p $(TORCH_HOME)
mkdir -p $(TRANSFORMERS_CACHE)
mkdir -p $(HF_HOME)
mkdir -p $(HF_HUB_CACHE)
mkdir -p $(PIP_CACHE_DIR)
displayName: "set Env. vars"
- bash: |
echo "##vso[task.setvariable variable=ALLOW_SKIP_IF_OUT_OF_MEMORY]1"
Expand Down Expand Up @@ -111,7 +123,7 @@ jobs:

- bash: |
python .github/assistant.py set-oldest-versions
condition: eq(variables['torch-ver'], '1.10.2')
condition: eq(variables['torch-ver'], '1.10')
displayName: "Setting oldest versions"

- bash: |
Expand All @@ -132,6 +144,21 @@ jobs:
displayName: "Show caches"

- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
Borda marked this conversation as resolved.
Show resolved Hide resolved
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"

- bash: |
df -h .
ls -lh $(CACHED_REFERENCES)
ls -lh tests/
# Check if the file references exists
if [ -f $(CACHED_REFERENCES) ]; then
# Create a directory if it doesn't already exist
Expand All @@ -142,25 +169,12 @@ jobs:
else
echo "The file '$(CACHED_REFERENCES)' does not exist."
fi
du -h --max-depth=1 tests/
timeoutInMinutes: "5"
# if pull request, copy the cache to the tests folder to be used in the next steps
condition: eq(variables['Build.Reason'], 'PullRequest')
continueOnError: "true"
displayName: "Copy/Unzip cached refs"

- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"

- bash: |
wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip
unzip -o data.zip
Expand All @@ -169,6 +183,7 @@ jobs:
displayName: "Pull testing data from S3"

- bash: |
du -h --max-depth=1 .
python -m pytest $(TEST_DIRS) \
-m "not DDP" --numprocesses=5 --dist=loadfile \
--cov=torchmetrics --timeout=240 --durations=100 \
Expand All @@ -192,9 +207,10 @@ jobs:
displayName: "UnitTesting DDP"

- bash: |
du -h --max-depth=1 tests/
# archive potentially updated cache to the machine filesystem to be reused with next jobs
zip -q -r $(CACHED_REFERENCES) tests/_cache-references
du -h --max-depth=1 tests/
ls -lh $(CACHED_REFERENCES)
# set as extra step to not pollute general cache when jobs fails or crashes
# so do this update only with successful jobs on master
condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
Expand All @@ -209,7 +225,6 @@ jobs:
python -m coverage xml
python -m codecov --token=$(CODECOV_TOKEN) --name="GPU-coverage" \
--commit=$(Build.SourceVersion) --flags=gpu,unittest --env=linux,azure
ls -l
workingDirectory: "tests/"
# skip for PR if there is nothing to test, note that outside PR there is default 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
Expand Down
2 changes: 1 addition & 1 deletion requirements/_docs.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ pydantic > 1.0.0, < 3.0.0
# todo: until this has resolution - https://github.com/sphinx-gallery/sphinx-gallery/issues/1290
# Image
scikit-image ~=0.22; python_version < "3.10"
scikit-image ~=0.24; python_version >= "3.10"
scikit-image ~=0.24; python_version > "3.9" # we do not use `> =` because of oldest replacement
11 changes: 6 additions & 5 deletions requirements/_tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

coverage ==7.6.*
codecov ==2.1.13
pytest ==8.3.*
pytest-cov ==5.0.0
pytest-doctestplus ==1.2.1
Expand All @@ -10,11 +11,11 @@ pytest-timeout ==2.3.1
pytest-xdist ==3.6.1
phmdoctest ==1.4.0

psutil <6.1.0
pyGithub ==2.4.0
fire <=0.6.0
psutil ==6.*
pyGithub >2.0.0, <2.5.0
fire ==0.6.*

cloudpickle >1.3, <=3.0.0
scikit-learn >=1.1.1, <1.3.0; python_version < "3.9"
scikit-learn >=1.4.0, <1.6.0; python_version >= "3.9"
scikit-learn ==1.2.*; python_version < "3.9"
scikit-learn ==1.5.*; python_version > "3.8" # we do not use `> =` because of oldest replacement
cachier ==3.0.1
4 changes: 2 additions & 2 deletions requirements/text.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

nltk >=3.8.2, <=3.9.1
tqdm >=4.41.0, <4.67.0
nltk >3.8.1, <=3.9.1
tqdm <4.67.0
regex >=2021.9.24, <=2024.7.24
transformers >4.4.0, <4.45.0
mecab-python3 >=1.0.6, <1.1.0
Expand Down
2 changes: 1 addition & 1 deletion src/torchmetrics/audio/dnsmos.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class DeepNoiseSuppressionMeanOpinionScore(Metric):
>>> preds = randn(8000)
>>> dnsmos = DeepNoiseSuppressionMeanOpinionScore(8000, False)
>>> dnsmos(preds)
tensor([2.2687, 2.0766, 1.1375, 1.2722], dtype=torch.float64)
tensor([2.2..., 2.0..., 1.1..., 1.2...], dtype=torch.float64)

"""

Expand Down
2 changes: 1 addition & 1 deletion src/torchmetrics/audio/stoi.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class ShortTimeObjectiveIntelligibility(Metric):
>>> target = randn(8000)
>>> stoi = ShortTimeObjectiveIntelligibility(8000, False)
>>> stoi(preds, target)
tensor(-0.0842)
tensor(-0.084...)

"""

Expand Down
2 changes: 1 addition & 1 deletion src/torchmetrics/functional/audio/dnsmos.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def deep_noise_suppression_mean_opinion_score(
>>> from torchmetrics.functional.audio.dnsmos import deep_noise_suppression_mean_opinion_score
>>> preds = randn(8000)
>>> deep_noise_suppression_mean_opinion_score(preds, 8000, False)
tensor([2.2687, 2.0766, 1.1375, 1.2722], dtype=torch.float64)
tensor([2.2..., 2.0..., 1.1..., 1.2...], dtype=torch.float64)

"""
if not _LIBROSA_AVAILABLE or not _ONNXRUNTIME_AVAILABLE or not _REQUESTS_AVAILABLE:
Expand Down
2 changes: 1 addition & 1 deletion src/torchmetrics/functional/audio/stoi.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def short_time_objective_intelligibility(
>>> preds = randn(8000)
>>> target = randn(8000)
>>> short_time_objective_intelligibility(preds, target, 8000).float()
tensor(-0.0842)
tensor(-0.084...)

"""
if not _PYSTOI_AVAILABLE:
Expand Down
2 changes: 2 additions & 0 deletions tests/unittests/audio/test_stoi.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from torch import Tensor
from torchmetrics.audio import ShortTimeObjectiveIntelligibility
from torchmetrics.functional.audio import short_time_objective_intelligibility
from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_2_0

from unittests import _Input
from unittests._helpers import seed_all
Expand Down Expand Up @@ -120,6 +121,7 @@ def test_error_on_different_shape(metric_class=ShortTimeObjectiveIntelligibility
metric(torch.randn(100), torch.randn(50))


@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_2_0, reason="precision issue with older torch")
def test_on_real_audio():
"""Test that metric works on real audio signal."""
rate, ref = wavfile.read(_SAMPLE_AUDIO_SPEECH)
Expand Down
6 changes: 3 additions & 3 deletions tests/unittests/segmentation/test_generalized_dice_score.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,11 @@ def _reference_generalized_dice(
],
)
@pytest.mark.parametrize("include_background", [True, False])
class TestMeanDiceScore(MetricTester):
class TestGeneralizedDiceScore(MetricTester):
"""Test class for `GeneralizedDiceScore` metric."""

@pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
def test_mean_iou_class(self, preds, target, input_format, include_background, ddp):
def test_generalized_dice_class(self, preds, target, input_format, include_background, ddp):
"""Test class implementation of metric."""
self.run_class_metric_test(
ddp=ddp,
Expand All @@ -90,7 +90,7 @@ def test_mean_iou_class(self, preds, target, input_format, include_background, d
},
)

def test_mean_iou_functional(self, preds, target, input_format, include_background):
def test_generalized_dice_functional(self, preds, target, input_format, include_background):
"""Test functional implementation of metric."""
self.run_functional_metric_test(
preds=preds,
Expand Down
Loading