embeddings-benchmark · isaac-chung · Mar 13, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -29,12 +29,12 @@
 - [ ] I have checked that the performance is neither trivial (both models gain close to perfect scores) nor random (both models gain close to random scores).
 - [ ] If the dataset is too big (e.g. >2048 examples), considering using `self.stratified_subsampling() under dataset_transform()`
 - [ ] I have filled out the metadata object in the dataset file (find documentation on it [here](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_dataset.md#2-creating-the-metadata-object)).
-- [ ] Run tests locally to make sure nothing is broken using `make test`. 
-- [ ] Run the formatter to format the code using `make lint`. 
+- [ ] Run tests locally to make sure nothing is broken using `make test`.
+- [ ] Run the formatter to format the code using `make lint`.
 
 
 ### Adding a model checklist
-<!-- 
+<!--
 When adding a model to the model registry
 see also https://github.com/embeddings-benchmark/mteb/blob/main/docs/reproducible_workflow.md
 -->
@@ -43,4 +43,4 @@ see also https://github.com/embeddings-benchmark/mteb/blob/main/docs/reproducibl
  - [ ] I have ensured that my model can be loaded using
    - [ ] `mteb.get_model(model_name, revision)` and
    - [ ] `mteb.get_model_meta(model_name, revision)`
- - [ ] I have tested the implementation works on a representative set of tasks.
+ - [ ] I have tested the implementation works on a representative set of tasks.
diff --git a/.github/workflows/dataset_loading.yml b/.github/workflows/dataset_loading.yml
@@ -0,0 +1,27 @@
+name: Datasets available on HuggingFace
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  extract-and-run:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v3
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+        cache: 'pip'
+
+    - name: Install dependencies
+      run: |
+        make install-for-tests
+    - name: Run dataset loading tests
+      run: |
+        make dataset-load-test
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -47,7 +47,7 @@ jobs:
       - name: Create table
         run: |
           make build-docs
-      
+
       - name: Push table
         run: |
           git config --global user.email "github-actions[bot]@users.noreply.github.com"
@@ -60,4 +60,3 @@ jobs:
             git commit -m "Update tasks table"
             git push
           fi
-
diff --git a/.github/workflows/leaderboard_refresh.yaml b/.github/workflows/leaderboard_refresh.yaml
@@ -2,8 +2,8 @@ name: Daily Space Rebuild
 on:
   schedule:
     # Runs at midnight Pacific Time (8 AM UTC)
-    - cron: '0 8 * * *'
-  workflow_dispatch:  # Allows manual triggering
+    - cron: "0 8 * * *"
+  workflow_dispatch: # Allows manual triggering
 
 jobs:
   rebuild:
@@ -12,5 +12,5 @@ jobs:
       - name: Trigger Factory Rebuild
         run: |
           curl -X POST \
-            "https://huggingface.co/api/spaces/mteb/leaderboard_2_demo/restart?factory=true" \
+            "https://huggingface.co/api/spaces/mteb/leaderboard/restart?factory=true" \
             -H "Authorization: Bearer ${{ secrets.HF_TOKEN }}"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -25,4 +25,3 @@ jobs:
         id: lint
         run: |
           make lint-check
-
diff --git a/.github/workflows/model_loading.yml b/.github/workflows/model_loading.yml
@@ -3,22 +3,22 @@ name: Model Loading
 on:
   pull_request:
     paths:
-      - 'mteb/models/**.py'
+      - "mteb/models/**.py"
 
 jobs:
   extract-and-run:
     runs-on: ubuntu-latest
 
     steps:
-    - name: Checkout repository
-      uses: actions/checkout@v3
+      - name: Checkout repository
+        uses: actions/checkout@v3
 
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.10'
-        cache: 'pip'
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+          cache: "pip"
 
-    - name: Install dependencies and run tests
-      run: |
-        make model-load-test BASE_BRANCH=${{ github.event.pull_request.base.ref }}
+      - name: Install dependencies and run tests
+        run: |
+          make model-load-test BASE_BRANCH=${{ github.event.pull_request.base.ref }}
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -20,8 +20,7 @@ jobs:
     runs-on: ubuntu-latest
     concurrency: release
     permissions:
-      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing using PyPI 
-
+      id-token: write # IMPORTANT: this permission is mandatory for trusted publishing using PyPI
 
     if: ${{ github.ref == 'refs/heads/main' && github.event.workflow_run.conclusion == 'success'}}
     steps:
@@ -40,8 +39,8 @@ jobs:
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
         if: steps.release.outputs.released == 'true'
-        # This action supports PyPI's trusted publishing implementation, which allows authentication to PyPI without a manually 
-        # configured API token or username/password combination. To perform trusted publishing with this action, your project's 
+        # This action supports PyPI's trusted publishing implementation, which allows authentication to PyPI without a manually
+        # configured API token or username/password combination. To perform trusted publishing with this action, your project's
         # publisher must already be configured on PyPI.
 
       - name: Publish package distributions to GitHub Releases

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -2,7 +2,6 @@
 # 1) install Python dependencies
 # 2) run make test
 
-
 name: Test
 on:
   push:
@@ -30,7 +29,7 @@ jobs:
         with:
           python-version: ${{ matrix.python-version }}
           cache: "pip"
-        
+
       - name: Install dependencies
         shell: bash
         run: |
@@ -53,4 +52,3 @@ jobs:
           # if it fails again, the workflow will fail.
           # If it passes the first time the test will not run again
           make test || make test
-
diff --git a/.gitignore b/.gitignore
@@ -151,4 +151,4 @@ model_names.txt
 mteb/leaderboard/__cached_results.json
 
 # gradio
-.gradio/
+.gradio/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,31 @@
+fail_fast: true
+
+repos:
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.23
+    hooks:
+      - id: validate-pyproject
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.3.0
+    hooks:
+    -   id: check-yaml
+    -   id: check-json
+    -   id: pretty-format-json
+        args:
+          - "--autofix"
+          - "--indent=4"
+          - "--no-sort-keys"
+    -   id: end-of-file-fixer # generated a lot of changes
+    -   id: trailing-whitespace
+    -   id: check-toml
+
+  - repo: local
+    hooks:
+      - id: lint
+        name: lint
+        description: "Run 'make lint'"
+        entry: make lint
+        language: python
+        types_or: [python]
+        minimum_pre_commit_version: "2.9.2"
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
@@ -2,4 +2,4 @@
     "recommendations": [
         "charliermarsh.ruff"
     ]
-}
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -4,5 +4,5 @@
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
-    "editor.defaultFormatter": "charliermarsh.ruff",
+    "editor.defaultFormatter": "charliermarsh.ruff"
 }
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -1,10 +1,8 @@
 ## Contributing to MTEB
-We welcome contributions such as new datasets to MTEB! Please see detailed see the related [issue](https://github.com/embeddings-benchmark/mteb/issues/360) for more information. 
-
-Once you have decided on your contribution, this document describes how to set up the repository for development.
+We welcome contributions. Please see the current open issues or open an issue yourself. Once you have decided on what you'd like to contribute, this document describes how to set up the repository for development.
 
 ### Development Installation
-If you want to submit a dataset or on other ways contribute to MTEB, you can install the package in development mode:
+If you want to submit a dataset or in other ways contribute to MTEB, you can install the package in development mode:
 
 ```bash
 git clone https://github.com/embeddings-benchmark/mteb
@@ -21,10 +19,10 @@ To run the tests, you can use the following command:
 make test
 ```
 
-This is also run by the CI pipeline, so you can be sure that your changes do not break the package. We recommend running the tests in the lowest version of python supported by the package (see the pyproject.toml) to ensure compatibility.
+This is also run by the CI pipeline, so you can be sure that your changes do not break the package. We recommend running the tests in the lowest version of Python supported by the package (see the pyproject.toml) to ensure compatibility.
 
 ### Running linting
-To run the linting before a PR you can use the following command:
+To run the linting before a PR, you can use the following command:
 
 ```bash
 make lint
@@ -33,12 +31,12 @@ make lint
 This command is equivalent to the command run during CI. It will check for code style and formatting issues.
 
 ## Semantic Versioning and Releases
-MTEB follows [semantic versioning](https://semver.org/). This means that the version number of the package is composed of three numbers: `MAJOR.MINOR.PATCH`. This allow us to use existing tools to automatically manage the versioning of the package. For maintainers (and contributors), this means that commits with the following prefixes will automatically trigger a version bump:
+MTEB follows [semantic versioning](https://semver.org/). This means that the version number of the package is composed of three numbers: `MAJOR.MINOR.PATCH`. This allows us to use existing tools to manage the versioning of the package automatically. For maintainers (and contributors), this means that commits with the following prefixes will automatically trigger a version bump:
 
 - `fix:` for patches
 - `feat:` for minor versions
 - `breaking:` for major versions
 
-Any commit with one of these prefixes will trigger a version bump upon merging to the main branch as long as tests pass. A version bump will then trigger a new release on PyPI as well as a new release on GitHub.
+Any commit with one of these prefixes will trigger a version bump upon merging to the main branch, as long as the tests pass. A version bump will then trigger a new release on PyPI as well as a new release on GitHub.
 
-Other prefixes will not trigger a version bump. For example, `docs:`, `chore:`, `refactor:`, etc., however they will structure the commit history and the changelog. You can find more information about this in the [python-semantic-release documentation](https://python-semantic-release.readthedocs.io/en/latest/). If you do not intend to trigger a version bump you're not required to follow this convention when contributing to MTEB.
+Other prefixes, such as `docs:`, `chore:`, and `refactor:`, will not trigger a version bump; however, they will still structure the commit history and the changelog. You can find more information about this in the [python-semantic-release documentation](https://python-semantic-release.readthedocs.io/en/latest/). If you do not intend to trigger a version bump, you're not required to follow this convention when contributing to MTEB.
diff --git a/Makefile b/Makefile
@@ -1,6 +1,7 @@
 install:
 	@echo "--- 🚀 Installing project dependencies ---"
 	pip install -e ".[dev]"
+	pre-commit install
 
 install-for-tests:
 	@echo "--- 🚀 Installing project dependencies for test ---"
@@ -10,7 +11,7 @@ install-for-tests:
 lint:
 	@echo "--- 🧹 Running linters ---"
 	ruff format . 			# running ruff formatting
-	ruff check . --fix  	# running ruff linting
+	ruff check . --fix --exit-non-zero-on-fix  	# running ruff linting # --exit-non-zero-on-fix is used for the pre-commit hook to work
 
 lint-check:
 	@echo "--- 🧹 Check is project is linted ---"
@@ -20,11 +21,12 @@ lint-check:
 
 test:
 	@echo "--- 🧪 Running tests ---"
-	pytest -n auto --durations=5
+	pytest -n auto -m "not test_datasets"
+
 
 test-with-coverage:
 	@echo "--- 🧪 Running tests with coverage ---"
-	pytest -n auto --durations=5 --cov-report=term-missing --cov-config=pyproject.toml --cov=mteb
+	pytest -n auto --cov-report=term-missing --cov-config=pyproject.toml --cov=mteb
 
 pr:
 	@echo "--- 🚀 Running requirements for a PR ---"
@@ -42,4 +44,20 @@ model-load-test:
 	@echo "--- 🚀 Running model load test ---"
 	pip install ".[dev, speedtask, pylate,gritlm,xformers,model2vec]"
 	python scripts/extract_model_names.py $(BASE_BRANCH) --return_one_model_name_per_file
-	python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt
+	python tests/test_models/model_loading.py --model_name_file scripts/model_names.txt
+
+
+dataset-load-test:
+	@echo "--- 🚀 Running dataset load test ---"
+	pytest -n auto -m test_datasets
+
+
+run-leaderboard:
+	@echo "--- 🚀 Running leaderboard locally ---"
+	python -m mteb.leaderboard.app
+
+
+.PHONY: check
+check: ## Run code quality tools.
+	@echo "--- 🧹 Running code quality tools ---"
+	@pre-commit run -a
Original file line number	Diff line number	Diff line change
Expand Up		@@ -25,4 +25,3 @@ jobs:
		id: lint
		run: \|
		make lint-check
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,4 +2,4 @@ @@
         "recommendations": [
             "charliermarsh.ruff"
         ]
-    }
+    }