diff --git a/.github/workflows/dataset_loading.yml b/.github/workflows/dataset_loading.yml index ab317db0aa..713725989e 100644 --- a/.github/workflows/dataset_loading.yml +++ b/.github/workflows/dataset_loading.yml @@ -13,6 +13,13 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 + - name: Cache Hugging Face + id: cache-hf + uses: actions/cache@v4 + with: + key: cache-dataset-loading + path: ${{ github.workspace }}/.cache/dataset_check_cache.json + - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/mteb/models/ops_moa_models.py b/mteb/models/ops_moa_models.py index 965b5800a2..9d2f12c40a 100644 --- a/mteb/models/ops_moa_models.py +++ b/mteb/models/ops_moa_models.py @@ -1,12 +1,11 @@ from __future__ import annotations -from mteb.model_meta import ModelMeta -from mteb.models.wrapper import Wrapper from functools import partial + from sentence_transformers import SentenceTransformer -import torch -import torch.nn as nn -from huggingface_hub import snapshot_download + +from mteb.model_meta import ModelMeta +from mteb.models.wrapper import Wrapper class CustomWrapper(Wrapper): diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index c030c2c906..e58e8be254 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -1,6 +1,9 @@ from __future__ import annotations +import json import logging +from datetime import datetime +from pathlib import Path from unittest.mock import Mock, patch import huggingface_hub @@ -75,9 +78,26 @@ def test_load_data( ) @pytest.mark.parametrize("dataset_revision", dataset_revisions) def test_dataset_on_hf(dataset_revision: tuple[str, str]): + CACHE_FILE = Path("./.cache/dataset_check_cache.json") repo_id, revision = dataset_revision + today = datetime.now().strftime("%Y-%m-%d") + repo_key = repo_id + "-" + revision + + if CACHE_FILE.exists(): + with CACHE_FILE.open("r") as f: + cache = json.load(f) + else: + cache = {} + + if cache.get(repo_key) == {"repo_id": repo_id, "revision": revision, "date": today}: + pytest.skip(f"Dataset {repo_id} - {revision} already checked today") + try: huggingface_hub.dataset_info(repo_id, revision=revision) + + cache[repo_key] = {"repo_id": repo_id, "revision": revision, "date": today} + with CACHE_FILE.open("w") as f: + json.dump(cache, f) except ( huggingface_hub.errors.RepositoryNotFoundError, huggingface_hub.errors.RevisionNotFoundError,