Skip to content

Commit

Permalink
ci: redo HF caching (#2350)
Browse files Browse the repository at this point in the history
* ci: redo HF caching

* show py-tree

* drop CUDA_LAUNCH_BLOCKING

* missing torch.use_deterministic_algorithms(False)

* reruns 3
  • Loading branch information
Borda authored Feb 6, 2024
1 parent a7424fd commit b187bfd
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 64 deletions.
4 changes: 3 additions & 1 deletion .azure/gpu-integrations.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ jobs:
DEVICES: $( python -c 'name = "$(Agent.Name)" ; gpus = name.split("_")[-1] if "_" in name else "0,1"; print(gpus)' )
# these caches assume the jobs run repeatedly on the same set of machines
TORCH_HOME: "/var/tmp/torch"
HF_HOME: "/var/tmp/huggingface"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
container:
image: "$(docker-image)"
Expand Down
19 changes: 8 additions & 11 deletions .azure/gpu-unittests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ jobs:
# see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481
TORCH_HOME: "/var/tmp/torch"
TOKENIZERS_PARALLELISM: "false"
HF_HOME: "/var/tmp/huggingface"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
# MKL_THREADING_LAYER: "GNU"
MKL_SERVICE_FORCE_INTEL: 1
Expand Down Expand Up @@ -105,12 +107,9 @@ jobs:
displayName: "Sanity check"
- bash: |
printf "cache location: $(TORCH_HOME)\n"
mkdir -p $(TORCH_HOME) # in case cache was void
ls -lh $(TORCH_HOME)
printf "cache location: $(HF_HOME)\n"
mkdir -p $(HF_HOME) # in case cache was void
ls -lh $(HF_HOME)
pip install -q py-tree
py-tree /var/tmp/torch
py-tree /var/tmp/hf
displayName: "Show caches"
- bash: |
Expand All @@ -133,9 +132,8 @@ jobs:
- bash: |
python -m pytest unittests -v \
-m "not DDP" --numprocesses=5 --dist=loadfile \
--cov=torchmetrics --timeout=240 --durations=500
env:
CUDA_LAUNCH_BLOCKING: "1"
--cov=torchmetrics --timeout=240 --durations=200 \
--reruns 3 --reruns-delay 1
workingDirectory: tests
displayName: "UnitTesting common"
Expand All @@ -145,7 +143,6 @@ jobs:
--cov=torchmetrics --timeout=240 --durations=500
env:
USE_PYTEST_POOL: "1"
CUDA_LAUNCH_BLOCKING: "1"
workingDirectory: tests
displayName: "UnitTesting DDP"
Expand Down
49 changes: 21 additions & 28 deletions .github/actions/pull-caches/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ inputs:
description: location of local PyPI cache
required: false
default: "_ci-cache_PyPI"
pypi-key:
description: cache restore/dump key
required: false
default: "pypi-packages"

runs:
using: "composite"
Expand All @@ -38,18 +42,22 @@ runs:
- name: Define caches
id: cache_dirs
run: |
torch_cache=$(python -c "import os ; print(os.path.join(os.getcwd(), '_ci-cache_pytorch'))")
echo "TORCH_HOME=$torch_cache" >> $GITHUB_ENV
hf_cache=$(python -c "import os ; print(os.path.join(os.getcwd(), '_ci-cache_huggingface'))")
echo "HF_HOME=$hf_cache" >> $GITHUB_ENV
cache_dir=$(python -c "import os ; print(os.path.join(os.getcwd(), '_ci-cache'))")
echo "CACHES_DIR=${cache_dir}" >> $GITHUB_ENV
dir_sep=$(python -c "import os ; print(os.path.sep)")
echo "TORCH_HOME=${cache_dir}${dir_sep}torch" >> $GITHUB_ENV
echo "TRANSFORMERS_CACHE=${cache_dir}${dir_sep}transformers" >> $GITHUB_ENV
echo "HF_HOME=${cache_dir}${dir_sep}hf-home" >> $GITHUB_ENV
echo "HF_HUB_CACHE=${cache_dir}${dir_sep}hf-hub" >> $GITHUB_ENV
shell: bash

- name: Cache pip
continue-on-error: true
uses: actions/cache/restore@v3
with:
enableCrossOsArchive: true
path: ${{ inputs.pypi-dir }}
key: pypi-packages
key: ${{ inputs.pypi-key }}

- name: Restored Packages
run: |
Expand All @@ -58,32 +66,17 @@ runs:
ls -lh ${{ inputs.pypi-dir }}
shell: bash

- name: Cache Torch
continue-on-error: true
uses: actions/cache/restore@v3
with:
path: ${{ env.TORCH_HOME }}
key: cache-pytorch

- name: Restored PT
if: ${{ runner.os == 'Linux' }}
run: |
mkdir -p $TORCH_HOME
printf "list $TORCH_HOME:\n"
sudo apt install -q -y tree
tree -h $TORCH_HOME
shell: bash

- name: Cache HF
- name: Cache Torch & HF
continue-on-error: true
uses: actions/cache/restore@v3
with:
path: ${{ env.HF_HOME }}
key: cache-transformers
enableCrossOsArchive: true
path: ${{ env.CACHES_DIR }}
key: ci-caches

- name: Restored HF
- name: Restored Torch & HF
run: |
mkdir -p $HF_HOME
printf "list $HF_HOME:\n"
ls -lh $HF_HOME
mkdir -p $CACHES_DIR
pip install -q py-tree
py-tree $CACHES_DIR
shell: bash
33 changes: 10 additions & 23 deletions .github/actions/push-caches/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ runs:
ls -lh .pip-wheels
shell: bash

- name: Cache pull
- name: Cache pull packages
uses: actions/cache/restore@v3
with:
enableCrossOsArchive: true
path: ${{ inputs.pypi-dir }}
key: ${{ inputs.pypi-key }}

Expand All @@ -69,36 +70,22 @@ runs:
if: ${{ steps.wheels-diff.outputs.count-new != 0 }}
shell: bash

- name: Cache push
- name: Cache push packages
if: ${{ steps.wheels-diff.outputs.count-new != 0 }}
uses: actions/cache/save@v3
with:
enableCrossOsArchive: true
path: ${{ inputs.pypi-dir }}
key: ${{ inputs.pypi-key }}

- name: Post PT
if: ${{ runner.os == 'Linux' }}
run: |
printf "list $TORCH_HOME:\n"
tree -h $TORCH_HOME
shell: bash

- name: Cache Torch
continue-on-error: true
uses: actions/cache/save@v3
with:
path: ${{ env.TORCH_HOME }}
key: cache-pytorch

- name: Post HF
run: |
printf "list $HF_HOME:\n"
ls -lh $HF_HOME
- name: Post Torch & HF
run: py-tree $CACHES_DIR
shell: bash

- name: Cache HF
- name: Cache Torch & HF
continue-on-error: true
uses: actions/cache/save@v3
with:
path: ${{ env.HF_HOME }}
key: cache-transformers
enableCrossOsArchive: true
path: ${{ env.CACHES_DIR }}
key: ci-caches
3 changes: 2 additions & 1 deletion tests/unittests/utilities/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def test_recursive_allclose(inputs, expected):
def test_cumsum_still_not_supported():
"""Make sure that cumsum on gpu and deterministic mode still fails.
If this test begins to passes, it means newer Pytorch versions support this and we can drop internal support.
If this test begins to pass, it means newer Pytorch versions support this and we can drop internal support.
"""
torch.use_deterministic_algorithms(True)
Expand All @@ -207,6 +207,7 @@ def test_custom_cumsum():
res = _cumsum(x, dim=0).cpu()
else:
res = _cumsum(x, dim=0).cpu()
torch.use_deterministic_algorithms(False)
res2 = np.cumsum(x.cpu(), axis=0)
assert torch.allclose(res, res2)

Expand Down

0 comments on commit b187bfd

Please sign in to comment.