Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
143 commits
Select commit Hold shift + click to select a range
32b7af8
Started the following:
sufen-f Feb 19, 2025
8eff2c6
Minor changes and linted files. #2093
sufen-f Feb 19, 2025
53a2e36
Minor changes and linted files. #2093
sufen-f Feb 20, 2025
ed93f2b
Minor changes and linted files. #2093
sufen-f Feb 20, 2025
fbab033
Refs #2068: Initial Implementation of audio-text retrieval abstask an…
imadtyx Feb 20, 2025
d39e187
Added MockAudioClustering task + MockAudioEncoder for testcase
alisartazkhan Feb 20, 2025
bcca37f
MockAudioClustering + MockAudioEncoder (#2093)
Feb 20, 2025
2a238ed
Added wav2vec model wrapper
alisartazkhan Feb 22, 2025
7816974
Added subTask with small sample of dataset for testing
Feb 22, 2025
07f53b1
Added four w2v variants
alisartazkhan Feb 23, 2025
882af38
Update wav2vec_models.py
alisartazkhan Feb 23, 2025
daeada0
Added wav2vec (5), wavlm (7), and whisper (5) models
alisartazkhan Feb 23, 2025
c1ebf2a
Added revisions from HF to wav2vec models, added silhouette score, DB…
sufen-f Feb 23, 2025
716deed
Update mteb/models/wavlm_models.py
alisartazkhan Feb 23, 2025
ce1bee9
setting up colab
sufen-f Feb 24, 2025
4cf7e6f
Merge remote-tracking branch 'origin/maeb' into maeb
sufen-f Feb 24, 2025
545b938
added a2a
Feb 24, 2025
ed978fa
PCA + hidden layer + shuffling
Feb 24, 2025
1616ba9
New task: emotion clustering
Feb 24, 2025
ac14d16
Added qwen2 model
alisartazkhan Feb 26, 2025
1302477
Added Wav2Vec model, voice clustering task, VoxCeleb dataset subset (…
sufen-f Feb 28, 2025
4f23fdf
Merge branch 'maeb' into maeb
sufen-f Feb 28, 2025
ee10191
Revert "Maeb - added voice clustering task, wav2vec model and VoxCele…
sufen-f Mar 1, 2025
f1449c0
Revert "Revert "Maeb - added voice clustering task, wav2vec model and…
sufen-f Mar 1, 2025
d731d40
Revert "Revert "Revert "Maeb - added voice clustering task, wav2vec m…
sufen-f Mar 1, 2025
a0de4fc
Add Audio (Multi Label) Classification Abstask, Baseline Audio model,…
anime-sh Mar 4, 2025
0620c58
Add ESC50 and zero-shot classification (#2133)
RahulSChand Mar 5, 2025
6d9eca3
Add unfused clap model for zero-shot (#2269)
RahulSChand Mar 6, 2025
2188585
Add new and complete version of FSD50K multi-label audio classificati…
RahulSChand Mar 8, 2025
bdefb14
added large, music and speech clap models (#2284)
RahulSChand Mar 8, 2025
2e5dc67
add AbsTaskAudioClassification, ESC50 & GunshotTriangulation datasets…
silky1708 Mar 10, 2025
bf9fe16
Add NSynth dataset (#2306)
silky1708 Mar 10, 2025
a94ea50
Add urbansound8k for zero-shot (#2292)
RahulSChand Mar 11, 2025
52a88ae
Add Emotion classification Ravdess dataset (#2320)
RahulSChand Mar 11, 2025
cd07f24
[MAEB] main merge (#2341)
isaac-chung Mar 13, 2025
ef30e3d
adding GTZAN Genre dataset (#2307)
silky1708 Mar 13, 2025
5cf3840
Adding Beijing Opera dataset (#2356)
silky1708 Mar 14, 2025
368e720
update TaskMetadata from mteb:maeb
silky1708 Mar 13, 2025
25136ba
make pr
silky1708 Mar 13, 2025
79e06fe
update ruff to 0.9.7; make lint
silky1708 Mar 13, 2025
f85627f
update TaskMetadata from mteb:maeb
silky1708 Mar 13, 2025
0cf07f4
update TaskMetadata
silky1708 Mar 14, 2025
7460a13
add Mridingham datasets
silky1708 Mar 14, 2025
d5caae6
rm comment
silky1708 Mar 14, 2025
187d7bc
Adding Libricount dataset (#2361)
silky1708 Mar 16, 2025
3bae6b6
Adding Crema-D Dataset for emotion classification [HEAR] (#2368)
silky1708 Mar 16, 2025
307aa57
Adding FSDD dataset (Free Spoken Digit Dataset) (#2371)
silky1708 Mar 16, 2025
6ad0bc2
Add VoxCelebSA, SpokenQAforIC, VehicleSoundClustering from Dynamic-SU…
diffunity Mar 17, 2025
230064a
fix FSD-50K Task Metadata, Label handling and add stratified subsampl…
anime-sh Mar 18, 2025
89ab596
Add music clustering dataset (#2232)
mina-parham Mar 26, 2025
f3a0403
[MAEB] merge main -> maeb (#2471)
isaac-chung Apr 1, 2025
5af86e5
Create AbsTask and Evaluator for audio pair classification task (#2457)
switchpiggy Apr 4, 2025
01c462d
Add Language, Gender, and Age classifcation tasks based on common-la…
anime-sh Apr 4, 2025
5acab7f
Merge main into MAEB (#2488)
isaac-chung Apr 4, 2025
31925c5
added wavlm models (#2472)
alisartazkhan Apr 4, 2025
7e57e9d
Adding SIB-FLEURS (#2357)
diffunity Apr 5, 2025
991a0fc
update wavlm models
alisartazkhan Apr 22, 2025
5fc6e4d
update wavlm models
alisartazkhan Apr 23, 2025
14f6b41
Add files via upload
mnasser3 Apr 29, 2025
9eaca21
Update whisper_models.py license format
mnasser3 Apr 29, 2025
040d5c6
Updated wavlm and whisper models to fit maeb structure (#2572)
alisartazkhan May 2, 2025
aba957c
Delete mteb/abstasks/Image/AbsTaskZeroshotClassification.py
isaac-chung May 3, 2025
2fada5b
[MAEB] Merge in main 20250503 (#2635)
isaac-chung May 3, 2025
4c53823
Added SpeechCommands Dataset (Subset) (#2645)
AdnanElAssadi56 May 6, 2025
804be31
Added ESC50 Clustering Dataset (#2652)
AdnanElAssadi56 May 7, 2025
e1bc62f
Added Qwen2-7b (#2660)
alisartazkhan May 8, 2025
41b4c45
Added the IEMOCAP Datasets (#2640)
AdnanElAssadi56 May 9, 2025
4cd81ce
Add sew-d and unispeech models
sufen-f May 17, 2025
1163e62
Add sew-d and unispeech models
sufen-f May 17, 2025
cef8d57
Merge branch 'model_development' into maeb
sufen-f May 17, 2025
2d25266
Revert "Merge branch 'model_development' into maeb"
sufen-f May 17, 2025
0fb74db
Reapply "Merge branch 'model_development' into maeb"
sufen-f May 17, 2025
a2e6cf2
Revert to 41b4c451d48ca1234b508a5972662dc0c25573fa
sufen-f May 17, 2025
390b867
Add sew-d and unispeech models #2693 #2694 (#2701)
sufen-f May 18, 2025
6f15209
Added Minds14 Dataset (#2644)
AdnanElAssadi56 May 19, 2025
17197e0
Added Hubert Models (#2689)
AdnanElAssadi56 May 23, 2025
ee8e26f
Added AST Model (#2691)
AdnanElAssadi56 May 23, 2025
95a03f7
Added Data2Vec Models (#2690)
AdnanElAssadi56 May 23, 2025
645255b
Adding BirdSet dataset
imadtyx Jun 1, 2025
e067d88
Update __init__.py to include BirdSet dataset(s)
imadtyx Jun 1, 2025
1afb4ac
MAEB: Encodec Model (#2754)
AdnanElAssadi56 Jun 2, 2025
d4b9abd
MAEB: MMS Models (#2750)
AdnanElAssadi56 Jun 2, 2025
cf51d8f
MAEB: Seamlessm4t Model (V2) (#2751)
AdnanElAssadi56 Jun 2, 2025
439ee37
[MAEB] CNN14 Model (PANNs) (#2757)
AdnanElAssadi56 Jun 2, 2025
6e434aa
Added TutAcoustic Scenes Dataset (#2647)
AdnanElAssadi56 Jun 3, 2025
88436e3
MAEB: M-CTC-T Model (#2753)
AdnanElAssadi56 Jun 3, 2025
c5d8484
Added GTZAN Clustering Dataset (#2653)
AdnanElAssadi56 Jun 3, 2025
1af8eb1
Added AmbientAcousticContext Dataset (#2642)
AdnanElAssadi56 Jun 3, 2025
69d67e4
Added Crema_d Dataset (#2651)
AdnanElAssadi56 Jun 3, 2025
cd7c6e9
Added VoxCeleb Clustering Dataset (#2654)
AdnanElAssadi56 Jun 3, 2025
eb173b9
Audio Reranking Abstask+ Evaluator + Mini/Dummy AudioCaps Subset (#2744)
AdnanElAssadi56 Jun 5, 2025
31f38f2
Added 5 datasets for audio pair classification (#2463)
kkaitlyn111 Jun 8, 2025
ece46da
Adds SpokeN-100-English (#2342)
mina-parham Jun 8, 2025
89563e1
Adds VocalSound dataset (#2337)
mina-parham Jun 8, 2025
9114dc6
Added Birdclef Subset Dataset (#2641)
AdnanElAssadi56 Jun 13, 2025
c383316
Merge branch 'maeb' of github.com:embeddings-benchmark/mteb into maeb
isaac-chung Jun 14, 2025
a81eec3
lint
isaac-chung Jun 14, 2025
e990850
Added VoxPopuli Datasets (#2648)
AdnanElAssadi56 Jun 20, 2025
6bc4c5a
added SpeechCommand dataset and Keyword spotting task (#2329)
RahulSChand Jun 21, 2025
bdbe51f
[MAEB] Merge from main up to 1.38.30 (#2840)
isaac-chung Jun 22, 2025
5510897
Added Yamnet and VGGish models (#2687)
ayush1298 Jun 23, 2025
3c464f9
Add urbansound 8k linear probing (#2845)
isaac-chung Jun 23, 2025
a4842d5
add stratified_subsampling to Audio clustering datasets (#2854)
isaac-chung Jun 28, 2025
1453ad6
Audio Reranking Eval Update + 5 Reranking Datasets (#2849)
AdnanElAssadi56 Jun 28, 2025
73c9d2c
[MAEB] Sync with 1.38.33 (#2883)
isaac-chung Jul 6, 2025
8a8a101
MAEB Classification Datasets Downsampling/Formatting + MTEB UPLOAD (#…
AdnanElAssadi56 Jul 9, 2025
c7b8542
Merge main maeb 07 10 (#2894)
Samoed Jul 10, 2025
74bdc03
merge main
Samoed Jul 10, 2025
8f8577f
SibFluers Dataset Multilingual Extention (#2890)
AdnanElAssadi56 Jul 11, 2025
f1eb63c
Implemented Audio Any2AnyRetrieval + 3 Datasets for A2A, A2T, T2A (#2…
kkaitlyn111 Jul 12, 2025
ab0899c
[MAEB] encode() for audio-only models should raise error (#2914)
isaac-chung Jul 18, 2025
f619034
fix: add missing clap model handling
isaac-chung Jul 18, 2025
4e79b1a
dataset: add Clotho by creating the datasets on the fly (#2915)
isaac-chung Jul 20, 2025
6b37b71
dataset: Add SoundDescs (#2911)
isaac-chung Jul 20, 2025
a19e7b4
Audio Retrieval Dataset: UrbanSound8K (#2920)
AdnanElAssadi56 Jul 21, 2025
698500d
Audio Retrieval Dataset: MACS (#2921)
AdnanElAssadi56 Jul 21, 2025
ca4b73c
SpeechT5 Model (#2901)
AdnanElAssadi56 Jul 21, 2025
6671fcc
MAEB Model MSCLAP (#2902)
AdnanElAssadi56 Jul 21, 2025
dd6a76a
MAEB Model Wav2Clip (#2908)
AdnanElAssadi56 Jul 21, 2025
7e1fb93
Audio Retrieval Dataset: EmoVDB (#2923)
AdnanElAssadi56 Jul 21, 2025
48febd1
MAEB Model MuQ-MuLan (#2909)
AdnanElAssadi56 Jul 21, 2025
7801759
fix encode() in audio models (#2926)
isaac-chung Jul 21, 2025
7a4be45
Audio Retrieval Dataset: HiFiTTS (#2924)
AdnanElAssadi56 Jul 21, 2025
8a01d4e
Audio Retrieval Dataset: MusicCaps (#2918)
AdnanElAssadi56 Jul 21, 2025
53071b3
Audio Retrieval Dataset: CMU-Arctic (#2929)
AdnanElAssadi56 Jul 23, 2025
b087dfe
Audio Models Batch Fix (#2932)
AdnanElAssadi56 Jul 23, 2025
aadd51e
Add AudioSet and AudioSetMini (#2952)
isaac-chung Jul 28, 2025
b875aa2
[MAEB] Fix whisper model audio inference (#2954)
isaac-chung Jul 30, 2025
54561ed
Common voice (#2951)
hepengfe Aug 2, 2025
d841b33
fleurs retrieval tasks (#2976)
hepengfe Aug 4, 2025
069b294
MAEB Model Evaluation Fixes (#2956)
AdnanElAssadi56 Aug 5, 2025
671be23
Fix ClothoA2T modality (#2988)
isaac-chung Aug 5, 2025
49528b6
Revert "MAEB Model Evaluation Fixes" (#2993)
isaac-chung Aug 6, 2025
5ba74fc
Human Subsets Tasks
AdnanElAssadi56 Aug 10, 2025
24ea203
Fixed Multilingual Classification Subset
AdnanElAssadi56 Aug 10, 2025
926acb2
add google embedding variant
AdnanElAssadi56 Sep 23, 2025
e134015
loader fix
AdnanElAssadi56 Sep 23, 2025
2438bb2
license add
AdnanElAssadi56 Sep 23, 2025
5a9667b
validator check
AdnanElAssadi56 Sep 23, 2025
5c97cd4
typo
AdnanElAssadi56 Sep 23, 2025
bd3d7a3
full loader
AdnanElAssadi56 Sep 23, 2025
bcc1bb0
typo
AdnanElAssadi56 Sep 23, 2025
d0b6b10
Merge branch 'main' of https://github.com/embeddings-benchmark/mteb i…
AdnanElAssadi56 Sep 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ on:
branches: [main]
pull_request:

permissions:
contents: write

jobs:
create-table-on-pr:
if: github.event_name == 'pull_request'
Expand All @@ -32,8 +35,6 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
token: ${{ secrets.RELEASE }}

- uses: actions/setup-python@v4
with:
Expand All @@ -49,6 +50,8 @@ jobs:
make build-docs

- name: Push table
env:
GITHUB_TOKEN: ${{ github.token }}
run: |
git config --global user.email "github-actions[bot]@users.noreply.github.com"
git config --global user.name "github-actions[bot]"
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,3 @@ Some of these amazing publications include (ordered chronologically):
- Kenneth Enevoldsen, Márton Kardos, Niklas Muennighoff, Kristoffer Laigaard Nielbo. "[The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding](https://arxiv.org/abs/2406.02396)" arXiv 2024
- Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee. "[ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain](https://arxiv.org/abs/2412.00532)" arXiv 2024
- Chenghao Xiao, Isaac Chung, Imene Kerboua, Jamie Stirling, Xin Zhang, Márton Kardos, Roman Solomatin, Noura Al Moubayed, Kenneth Enevoldsen, Niklas Muennighoff. "[MIEB: Massive Image Embedding Benchmark](https://arxiv.org/abs/2504.10471)" arXiv 2025

4 changes: 2 additions & 2 deletions docs/adding_a_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ model_meta.calculate_memory_usage_mb()
### Adding instruction models

Some models, such as the [E5 models](https://huggingface.co/intfloat/multilingual-e5-large-instruct), use instructions or prompts.
You can directly add the prompts when saving and uploading your model to the Hub. Refer to this [configuration file as an example](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/blob/3b5a16eaf17e47bd997da998988dce5877a57092/config_sentence_transformers.json).
You can directly add the prompts when saving and uploading your model to the Hub. Refer to this [configuration file as an example](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/blob/3b5a16eaf17e47bd997da998988dce5877a57092/config_sentence_transformers.json).

However, you can also add these directly to the model configuration:

Expand Down Expand Up @@ -142,4 +142,4 @@ When submitting you models as a PR, please copy and paste the following checklis
- [ ] `mteb.get_model(model_name, revision)` and
- [ ] `mteb.get_model_meta(model_name, revision)`
- [ ] I have tested the implementation works on a representative set of tasks.
- [ ] The model is public, i.e. is available either as an API or the wieght are publicly avaiable to download
- [ ] The model is public, i.e. is available either as an API or the weights are publicly available to download
8 changes: 4 additions & 4 deletions docs/mieb/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ MIEB intends to extend MTEB and MMTEB to cover image representation learning and

## 🚀 Running MIEB

If you’re already familiar with how MTEB works, then run any benchmark, task, and model the same way!
If you’re already familiar with how MTEB works, then run any benchmark, task, and model the same way!


### Run MIEB in 2 lines via CLI
Expand Down Expand Up @@ -46,14 +46,14 @@ Or select tasks by categories:
tasks = mteb.get_tasks(task_types=["Compositionality"])
```

2. Load a Model:
2. Load a Model:

```python
model_name = "laion/CLIP-ViT-L-14-laion2B-s32B-b82K"
model = mteb.get_model(model_name=model_name)
```

3. Run the Evaluation:
3. Run the Evaluation:

```python
evaluation = mteb.MTEB(tasks=tasks)
Expand All @@ -71,7 +71,7 @@ There are a few ways for anyone to contribute to MIEB:
2. Add a model. This could mean either: a) The model wrapper, e.g. `OpenCLIPWrapper`, already exists, and the effort is solely in adding a filled out `ModelMeta` object, and/or b) Add a new model wrapper.
3. Add a new task type. This means that the existing task types do not cover this new task. An accompanying evaluator should also be implemented.

Let's go through an example.
Let's go through an example.

<details>
<summary> Contribution Example (click to unfold) </summary>
Expand Down
257 changes: 257 additions & 0 deletions mteb/abstasks/Audio/AbsTaskAudioClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
from __future__ import annotations

import logging
from collections import defaultdict
from typing import Any

import numpy as np
from sklearn.model_selection import KFold

from mteb.abstasks.TaskMetadata import HFSubset

from ...encoder_interface import AudioEncoder
from ...evaluation.evaluators import AudiologRegClassificationEvaluator
from ..AbsTask import AbsTask, ScoresDict

logger = logging.getLogger(__name__)


class AbsTaskAudioClassification(AbsTask):
    """Abstract class for audio classification tasks.

    Each clip is embedded with an audio encoder; a classifier (logistic
    regression when ``method == "logReg"``) is then trained on a bootstrapped
    subset of the train split and scored on the eval split. Scores are averaged
    over ``n_experiments`` bootstrap runs (or over ``n_splits`` folds when
    ``is_cross_validation`` is set).

    self.load_data() must generate a huggingface dataset with a split matching
    self.metadata_dict["eval_splits"], and assign it to self.dataset. It must
    contain the following columns:
        audio: datasets.Audio
        labels: int  (column name configurable via ``label_column_name``)
    """

    # Dataset column names; concrete tasks may override.
    audio_column_name: str = "audio"
    label_column_name: str = "labels"
    # When True, the dataset has a single split and k-fold cross-validation is
    # performed on it instead of using separate train/test splits.
    is_cross_validation: bool = False
    n_splits: int = 5  # by default: 5-fold cross-validation

    def __init__(
        self,
        method: str = "logReg",
        n_experiments: int | None = None,
        samples_per_label: int | None = None,
        k: int = 3,
        **kwargs,
    ):
        """Initialize the task.

        Args:
            method: Classifier used by the evaluator; only "logReg" is
                currently supported.
            n_experiments: Number of bootstrap experiments. Falls back to the
                task metadata value, then to 5.
            samples_per_label: Training samples drawn per label in each
                experiment. Falls back to the task metadata value, then to 16.
            k: kNN parameter forwarded to the evaluator.
            **kwargs: Forwarded to ``AbsTask.__init__``.
        """
        super().__init__(**kwargs)
        self.method = method

        # Bootstrap parameters: explicit arguments win over task metadata.
        self.n_experiments: int = (  # type: ignore
            n_experiments
            if n_experiments is not None
            else self.metadata_dict.get("n_experiments", 5)
        )
        self.samples_per_label: int = (  # type: ignore
            samples_per_label
            if samples_per_label is not None
            else self.metadata_dict.get("samples_per_label", 16)
        )

        # kNN parameters
        self.k = k

    def _add_main_score(self, scores: dict[HFSubset, ScoresDict]) -> None:
        # Mirror the metric named by the task metadata under "main_score".
        scores["main_score"] = scores[self.metadata.main_score]

    def _calculate_metrics_from_split(
        self, split: str, hf_subset: str | None = None, compute_overall: bool = False
    ):
        # Descriptive statistics are not computed for audio classification.
        pass

    def evaluate(
        self,
        model: AudioEncoder,
        eval_split: str = "test",
        train_split: str = "train",
        *,
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> dict[HFSubset, ScoresDict]:
        """Evaluate ``model`` on every subset of the task.

        Args:
            model: Audio encoder producing embeddings for the classifier.
            eval_split: Name of the evaluation split.
            train_split: Name of the training split (must equal ``eval_split``
                when ``is_cross_validation`` is True).
            encode_kwargs: Extra keyword arguments forwarded to the model's
                encode call. Defaults to an empty dict (a ``None`` default is
                used instead of ``{}`` to avoid a shared mutable default).
            **kwargs: Forwarded to the evaluator.

        Returns:
            Mapping of HF subset name to its averaged scores.
        """
        if encode_kwargs is None:
            encode_kwargs = {}
        if not self.data_loaded:
            self.load_data()

        scores = {}
        hf_subsets = list(self.dataset) if self.is_multilingual else ["default"]

        for hf_subset in hf_subsets:
            logger.info(
                f"\nTask: {self.metadata.name}, split: {eval_split}, subset: {hf_subset}. Running..."
            )

            # Monolingual datasets may not expose a "default" key; use the
            # dataset itself in that case.
            if hf_subset not in self.dataset and hf_subset == "default":
                ds = self.dataset
            else:
                ds = self.dataset[hf_subset]

            evaluate_fn = (
                self._evaluate_subset_cross_validation
                if self.is_cross_validation
                else self._evaluate_subset
            )
            scores[hf_subset] = evaluate_fn(
                model,
                ds,
                eval_split,
                train_split,
                encode_kwargs=encode_kwargs,
                **kwargs,
            )
            self._add_main_score(scores[hf_subset])

        return scores

    def _evaluate_subset_cross_validation(
        self,
        model: AudioEncoder,
        dataset,
        eval_split: str = "test",
        train_split: str = "train",
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> ScoresDict:
        """Run ``n_splits``-fold cross-validation on a single split.

        Raises:
            ValueError: If ``train_split`` and ``eval_split`` differ (the
                dataset then has real splits and cross-validation is wrong),
                or if ``self.method`` is unsupported.
        """
        if encode_kwargs is None:
            encode_kwargs = {}
        # Raised explicitly (previously an `assert`) so the check survives
        # running under `python -O`.
        if train_split != eval_split:
            raise ValueError(
                f"Performing {self.n_splits}-fold cross validation, but the dataset has a train (`{train_split}`) and test split (`{eval_split}`)! Set `is_cross_validation` to False, and retry."
            )
        logger.info(
            f"Performing {self.n_splits}-fold cross-validation on the entire dataset!"
        )

        ds = dataset[train_split]
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=42)

        params = {"k": self.k, **kwargs}
        scores = []
        # idxs is threaded through the folds so the undersampling shuffle is
        # performed only once, keeping the experiments reproducible.
        test_cache, idxs = None, None

        for train_idx, val_idx in kf.split(range(len(ds))):
            scores_exp, test_cache, idxs = self._run_single_experiment(
                model,
                ds.select(train_idx),
                ds.select(val_idx),
                encode_kwargs,
                params,
                test_cache,
                idxs,
            )
            scores.append(scores_exp)

        return self._average_scores(scores)

    def _evaluate_subset(
        self,
        model: AudioEncoder,
        dataset,
        eval_split: str = "test",
        train_split: str = "train",
        encode_kwargs: dict[str, Any] | None = None,
        **kwargs,
    ) -> ScoresDict:
        """Run ``n_experiments`` bootstrap experiments on train/eval splits.

        Raises:
            ValueError: If ``self.method`` is unsupported.
        """
        if encode_kwargs is None:
            encode_kwargs = {}
        train_ds = dataset[train_split]
        eval_ds = dataset[eval_split]
        params = {"k": self.k, **kwargs}

        scores = []
        # idxs is reused across experiments so the undersampling shuffle is
        # performed only once, keeping the experiments reproducible; the test
        # embedding cache is reused because the eval split never changes.
        test_cache, idxs = None, None
        for i in range(self.n_experiments):
            logger.info(
                "=" * 10 + f" Experiment {i + 1}/{self.n_experiments} " + "=" * 10
            )
            scores_exp, test_cache, idxs = self._run_single_experiment(
                model,
                train_ds,
                eval_ds,
                encode_kwargs,
                params,
                test_cache,
                idxs,
            )
            scores.append(scores_exp)

        return self._average_scores(scores)

    def _run_single_experiment(
        self,
        model: AudioEncoder,
        train_ds,
        eval_ds,
        encode_kwargs: dict[str, Any],
        params: dict[str, Any],
        test_cache,
        idxs,
    ):
        """Bootstrap the train split, run one evaluator pass, and return
        ``(scores, test_cache, idxs)``.

        Shared by the bootstrap and cross-validation paths so the evaluator
        construction is defined in exactly one place.
        """
        # Bootstrap `self.samples_per_label` samples per label.
        undersampled_train, idxs = self._undersample_data(
            train_ds,
            self.label_column_name,
            self.samples_per_label,
            idxs=idxs,
        )

        if self.method == "logReg":
            evaluator = AudiologRegClassificationEvaluator(
                undersampled_train,
                eval_ds,
                self.audio_column_name,
                self.label_column_name,
                task_name=self.metadata.name,
                encode_kwargs=encode_kwargs,
                **params,
            )
        else:
            raise ValueError(f"Method {self.method} not supported")

        scores_exp, test_cache = evaluator(model, test_cache=test_cache)
        return scores_exp, test_cache, idxs

    @staticmethod
    def _average_scores(scores: list[ScoresDict]) -> dict[str, Any]:
        """Average per-experiment metric dicts and attach the raw scores."""
        avg_scores: dict[str, Any] = {
            key: np.mean([s[key] for s in scores]) for key in scores[0]
        }
        avg_scores["scores_per_experiment"] = scores
        return avg_scores

    def _undersample_data(
        self, dataset_split, label_column_name, samples_per_label, idxs=None
    ):
        """Undersample data to have at most ``samples_per_label`` samples of
        each label without loading all audio into memory.

        Args:
            dataset_split: HF dataset split to sample from.
            label_column_name: Name of the label column.
            samples_per_label: Cap on selected samples per label.
            idxs: Shuffled index order from a previous call; passing it back in
                reuses the same order, making repeated sampling reproducible.

        Returns:
            Tuple of (undersampled dataset, shuffled index list).
        """
        if idxs is None:
            idxs = np.arange(len(dataset_split))
            np.random.shuffle(idxs)
        if not isinstance(idxs, list):
            idxs = idxs.tolist()
        label_counter = defaultdict(int)
        selected_indices = []

        # Read only the label column; audio decoding is deferred to `select`.
        labels = dataset_split[label_column_name]
        for i in idxs:
            label = labels[i]
            if label_counter[label] < samples_per_label:
                selected_indices.append(i)
                label_counter[label] += 1

        undersampled_dataset = dataset_split.select(selected_indices)
        return (
            undersampled_dataset,
            idxs,
        )
Loading
Loading