From 3e890d700601d0d96ac842e137190aefc7514119 Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Tue, 3 Sep 2024 08:59:01 +0200 Subject: [PATCH] ci/gpu: debug skipped cache (#2709) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .azure/gpu-nuke-cache.yml | 56 +++++++++++++++++++ .azure/gpu-unittests.yml | 47 ++++++++++------ requirements/_docs.txt | 2 +- requirements/_tests.txt | 11 ++-- requirements/text.txt | 4 +- src/torchmetrics/audio/dnsmos.py | 2 +- src/torchmetrics/audio/stoi.py | 2 +- src/torchmetrics/functional/audio/dnsmos.py | 2 +- src/torchmetrics/functional/audio/stoi.py | 2 +- tests/unittests/audio/test_stoi.py | 2 + .../test_generalized_dice_score.py | 6 +- 11 files changed, 105 insertions(+), 31 deletions(-) create mode 100644 .azure/gpu-nuke-cache.yml diff --git a/.azure/gpu-nuke-cache.yml b/.azure/gpu-nuke-cache.yml new file mode 100644 index 00000000000..f8f758ce8c0 --- /dev/null +++ b/.azure/gpu-nuke-cache.yml @@ -0,0 +1,56 @@ +trigger: + tags: + include: + - "*" +# run every month to sanitize dev environment +schedules: + - cron: "0 0 1 * *" + displayName: Monthly nuke caches + branches: + include: + - master +# run on PR changing only this file +pr: + branches: + include: + - master + paths: + include: + - .azure/gpu-nuke-cache.yml + +jobs: + - job: nuke_caches + # how long to run the job before automatically cancelling + timeoutInMinutes: "10" + # how much time to give 'run always even if cancelled tasks' before stopping them + cancelTimeoutInMinutes: "2" + + pool: "lit-rtx-3090" + + variables: + # these two caches assume to run repetitively on the same set of machines + # see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481 + TORCH_HOME: "/var/tmp/torch" + TRANSFORMERS_CACHE: "/var/tmp/hf/transformers" + HF_HOME: "/var/tmp/hf/home" + HF_HUB_CACHE: "/var/tmp/hf/hub" + PIP_CACHE_DIR: "/var/tmp/pip" + 
CACHED_REFERENCES: "/var/tmp/cached-references.zip" + + container: + image: "ubuntu:22.04" + options: "-v /var/tmp:/var/tmp" + + steps: + - bash: | + set -ex + rm -rf $(TORCH_HOME) + rm -rf $(TRANSFORMERS_CACHE) + rm -rf $(HF_HOME) + rm -rf $(HF_HUB_CACHE) + rm -rf $(PIP_CACHE_DIR) + rm -rf $(CACHED_REFERENCES) + displayName: "delete all caches" + - bash: | + ls -lh /var/tmp + displayName: "show tmp/ folder" diff --git a/.azure/gpu-unittests.yml b/.azure/gpu-unittests.yml index 25dc593558d..78dc6beee86 100644 --- a/.azure/gpu-unittests.yml +++ b/.azure/gpu-unittests.yml @@ -9,6 +9,13 @@ trigger: - master - release/* - refs/tags/* +# run every month to populate caches +schedules: + - cron: "0 1 1 * *" + displayName: Monthly re-build caches + branches: + include: + - master pr: - master - release/* @@ -67,6 +74,11 @@ jobs: CUDA_version_mm="${CUDA_version//'.'/''}" echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm" echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html" + mkdir -p $(TORCH_HOME) + mkdir -p $(TRANSFORMERS_CACHE) + mkdir -p $(HF_HOME) + mkdir -p $(HF_HUB_CACHE) + mkdir -p $(PIP_CACHE_DIR) displayName: "set Env. vars" - bash: | echo "##vso[task.setvariable variable=ALLOW_SKIP_IF_OUT_OF_MEMORY]1" @@ -111,7 +123,7 @@ jobs: - bash: | python .github/assistant.py set-oldest-versions - condition: eq(variables['torch-ver'], '1.10.2') + condition: eq(variables['torch-ver'], '1.10') displayName: "Setting oldest versions" - bash: | @@ -132,6 +144,21 @@ jobs: displayName: "Show caches" - bash: | + python -m pytest torchmetrics --cov=torchmetrics \ + --timeout=240 --durations=50 \ + --reruns 2 --reruns-delay 1 + # --numprocesses=5 --dist=loadfile + env: + DOCTEST_DOWNLOAD_TIMEOUT: "180" + SKIP_SLOW_DOCTEST: "1" + workingDirectory: "src/" + timeoutInMinutes: "40" + displayName: "DocTesting" + + - bash: | + df -h . 
+ ls -lh $(CACHED_REFERENCES) + ls -lh tests/ # Check if the file references exists if [ -f $(CACHED_REFERENCES) ]; then # Create a directory if it doesn't already exist @@ -142,25 +169,12 @@ jobs: else echo "The file '$(CACHED_REFERENCES)' does not exist." fi - du -h --max-depth=1 tests/ timeoutInMinutes: "5" # if pull request, copy the cache to the tests folder to be used in the next steps condition: eq(variables['Build.Reason'], 'PullRequest') continueOnError: "true" displayName: "Copy/Unzip cached refs" - - bash: | - python -m pytest torchmetrics --cov=torchmetrics \ - --timeout=240 --durations=50 \ - --reruns 2 --reruns-delay 1 - # --numprocesses=5 --dist=loadfile - env: - DOCTEST_DOWNLOAD_TIMEOUT: "180" - SKIP_SLOW_DOCTEST: "1" - workingDirectory: "src/" - timeoutInMinutes: "40" - displayName: "DocTesting" - - bash: | wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip unzip -o data.zip @@ -169,6 +183,7 @@ jobs: displayName: "Pull testing data from S3" - bash: | + du -h --max-depth=1 . 
python -m pytest $(TEST_DIRS) \ -m "not DDP" --numprocesses=5 --dist=loadfile \ --cov=torchmetrics --timeout=240 --durations=100 \ @@ -192,9 +207,10 @@ jobs: displayName: "UnitTesting DDP" - bash: | + du -h --max-depth=1 tests/ # archive potentially updated cache to the machine filesystem to be reused with next jobs zip -q -r $(CACHED_REFERENCES) tests/_cache-references - du -h --max-depth=1 tests/ + ls -lh $(CACHED_REFERENCES) # set as extra step to not pollute general cache when jobs fails or crashes # so do this update only with successful jobs on master condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest')) @@ -209,7 +225,6 @@ jobs: python -m coverage xml python -m codecov --token=$(CODECOV_TOKEN) --name="GPU-coverage" \ --commit=$(Build.SourceVersion) --flags=gpu,unittest --env=linux,azure - ls -l workingDirectory: "tests/" # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' condition: and(succeeded(), ne(variables['TEST_DIRS'], '')) diff --git a/requirements/_docs.txt b/requirements/_docs.txt index 1752cd11dc2..d2e17eb64ce 100644 --- a/requirements/_docs.txt +++ b/requirements/_docs.txt @@ -29,4 +29,4 @@ pydantic > 1.0.0, < 3.0.0 # todo: until this has resolution - https://github.com/sphinx-gallery/sphinx-gallery/issues/1290 # Image scikit-image ~=0.22; python_version < "3.10" -scikit-image ~=0.24; python_version >= "3.10" +scikit-image ~=0.24; python_version > "3.9" # we do not use `> =` because of oldest replacement diff --git a/requirements/_tests.txt b/requirements/_tests.txt index 1aeb982ee9e..7708dfa0f3e 100644 --- a/requirements/_tests.txt +++ b/requirements/_tests.txt @@ -2,6 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment coverage ==7.6.* +codecov ==2.1.13 pytest ==8.3.* pytest-cov ==5.0.0 pytest-doctestplus ==1.2.1 @@ -10,11 +11,11 @@ pytest-timeout ==2.3.1 pytest-xdist ==3.6.1 phmdoctest ==1.4.0 -psutil <6.1.0 
-pyGithub ==2.4.0 -fire <=0.6.0 +psutil ==6.* +pyGithub >2.0.0, <2.5.0 +fire ==0.6.* cloudpickle >1.3, <=3.0.0 -scikit-learn >=1.1.1, <1.3.0; python_version < "3.9" -scikit-learn >=1.4.0, <1.6.0; python_version >= "3.9" +scikit-learn ==1.2.*; python_version < "3.9" +scikit-learn ==1.5.*; python_version > "3.8" # we do not use `> =` because of oldest replacement cachier ==3.0.1 diff --git a/requirements/text.txt b/requirements/text.txt index 65396b65451..abdfe6808a7 100644 --- a/requirements/text.txt +++ b/requirements/text.txt @@ -1,8 +1,8 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -nltk >=3.8.2, <=3.9.1 -tqdm >=4.41.0, <4.67.0 +nltk >3.8.1, <=3.9.1 +tqdm <4.67.0 regex >=2021.9.24, <=2024.7.24 transformers >4.4.0, <4.45.0 mecab-python3 >=1.0.6, <1.1.0 diff --git a/src/torchmetrics/audio/dnsmos.py b/src/torchmetrics/audio/dnsmos.py index d164721f167..74d035a7fd4 100644 --- a/src/torchmetrics/audio/dnsmos.py +++ b/src/torchmetrics/audio/dnsmos.py @@ -79,7 +79,7 @@ class DeepNoiseSuppressionMeanOpinionScore(Metric): >>> preds = randn(8000) >>> dnsmos = DeepNoiseSuppressionMeanOpinionScore(8000, False) >>> dnsmos(preds) - tensor([2.2687, 2.0766, 1.1375, 1.2722], dtype=torch.float64) + tensor([2.2..., 2.0..., 1.1..., 1.2...], dtype=torch.float64) """ diff --git a/src/torchmetrics/audio/stoi.py b/src/torchmetrics/audio/stoi.py index cf5d5204f77..253dab3ea38 100644 --- a/src/torchmetrics/audio/stoi.py +++ b/src/torchmetrics/audio/stoi.py @@ -69,7 +69,7 @@ class ShortTimeObjectiveIntelligibility(Metric): >>> target = randn(8000) >>> stoi = ShortTimeObjectiveIntelligibility(8000, False) >>> stoi(preds, target) - tensor(-0.0842) + tensor(-0.084...) 
""" diff --git a/src/torchmetrics/functional/audio/dnsmos.py b/src/torchmetrics/functional/audio/dnsmos.py index 91c69de7a2b..9b0dca883db 100644 --- a/src/torchmetrics/functional/audio/dnsmos.py +++ b/src/torchmetrics/functional/audio/dnsmos.py @@ -218,7 +218,7 @@ def deep_noise_suppression_mean_opinion_score( >>> from torchmetrics.functional.audio.dnsmos import deep_noise_suppression_mean_opinion_score >>> preds = randn(8000) >>> deep_noise_suppression_mean_opinion_score(preds, 8000, False) - tensor([2.2687, 2.0766, 1.1375, 1.2722], dtype=torch.float64) + tensor([2.2..., 2.0..., 1.1..., 1.2...], dtype=torch.float64) """ if not _LIBROSA_AVAILABLE or not _ONNXRUNTIME_AVAILABLE or not _REQUESTS_AVAILABLE: diff --git a/src/torchmetrics/functional/audio/stoi.py b/src/torchmetrics/functional/audio/stoi.py index 91d09cc64c3..48e9e78510b 100644 --- a/src/torchmetrics/functional/audio/stoi.py +++ b/src/torchmetrics/functional/audio/stoi.py @@ -64,7 +64,7 @@ def short_time_objective_intelligibility( >>> preds = randn(8000) >>> target = randn(8000) >>> short_time_objective_intelligibility(preds, target, 8000).float() - tensor(-0.0842) + tensor(-0.084...) 
""" if not _PYSTOI_AVAILABLE: diff --git a/tests/unittests/audio/test_stoi.py b/tests/unittests/audio/test_stoi.py index 54374098779..2d872507401 100644 --- a/tests/unittests/audio/test_stoi.py +++ b/tests/unittests/audio/test_stoi.py @@ -20,6 +20,7 @@ from torch import Tensor from torchmetrics.audio import ShortTimeObjectiveIntelligibility from torchmetrics.functional.audio import short_time_objective_intelligibility +from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_2_0 from unittests import _Input from unittests._helpers import seed_all @@ -120,6 +121,7 @@ def test_error_on_different_shape(metric_class=ShortTimeObjectiveIntelligibility metric(torch.randn(100), torch.randn(50)) +@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_2_0, reason="precision issue with older torch") def test_on_real_audio(): """Test that metric works on real audio signal.""" rate, ref = wavfile.read(_SAMPLE_AUDIO_SPEECH) diff --git a/tests/unittests/segmentation/test_generalized_dice_score.py b/tests/unittests/segmentation/test_generalized_dice_score.py index f7e43ef2b56..3f8acec842a 100644 --- a/tests/unittests/segmentation/test_generalized_dice_score.py +++ b/tests/unittests/segmentation/test_generalized_dice_score.py @@ -66,11 +66,11 @@ def _reference_generalized_dice( ], ) @pytest.mark.parametrize("include_background", [True, False]) -class TestMeanDiceScore(MetricTester): +class TestGeneralizedDiceScore(MetricTester): """Test class for `MeanIoU` metric.""" @pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False]) - def test_mean_iou_class(self, preds, target, input_format, include_background, ddp): + def test_generalized_dice_class(self, preds, target, input_format, include_background, ddp): """Test class implementation of metric.""" self.run_class_metric_test( ddp=ddp, @@ -90,7 +90,7 @@ def test_mean_iou_class(self, preds, target, input_format, include_background, d }, ) - def test_mean_iou_functional(self, preds, target, input_format, 
include_background): + def test_generalized_dice_functional(self, preds, target, input_format, include_background): """Test functional implementation of metric.""" self.run_functional_metric_test( preds=preds,