Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions mteb/descriptive_stats/AudioClassification/GLOBEV3Age.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
{
"test": {
"num_samples": 5000,
"number_texts_intersect_with_train": null,
"text_statistics": null,
"image_statistics": null,
"audio_statistics": {
"total_duration_seconds": 25920.202374999975,
"min_duration_seconds": 0.56,
"average_duration_seconds": 5.184040474999995,
"max_duration_seconds": 22.78,
"unique_audios": 5000,
"average_sampling_rate": 24000.0,
"sampling_rates": {
"24000": 5000
}
},
"label_statistics": {
"min_labels_per_text": 1,
"average_label_per_text": 1.0,
"max_labels_per_text": 1,
"unique_labels": 9,
"labels": {
"twenties": {
"count": 1378
},
"fifties": {
"count": 638
},
"teens": {
"count": 606
},
"fourties": {
"count": 570
},
"thirties": {
"count": 1382
},
"sixties": {
"count": 263
},
"seventies": {
"count": 160
},
"nineties": {
"count": 2
},
"eighties": {
"count": 1
}
}
}
},
"train": {
"num_samples": 5000,
"number_texts_intersect_with_train": null,
"text_statistics": null,
"image_statistics": null,
"audio_statistics": {
"total_duration_seconds": 25967.226791666697,
"min_duration_seconds": 1.08,
"average_duration_seconds": 5.1934453583333395,
"max_duration_seconds": 10.539958333333333,
"unique_audios": 5000,
"average_sampling_rate": 24000.0,
"sampling_rates": {
"24000": 5000
}
},
"label_statistics": {
"min_labels_per_text": 1,
"average_label_per_text": 1.0,
"max_labels_per_text": 1,
"unique_labels": 9,
"labels": {
"fourties": {
"count": 617
},
"thirties": {
"count": 617
},
"twenties": {
"count": 617
},
"teens": {
"count": 617
},
"fifties": {
"count": 618
},
"sixties": {
"count": 618
},
"seventies": {
"count": 618
},
"eighties": {
"count": 618
},
"nineties": {
"count": 60
}
}
}
}
}
64 changes: 64 additions & 0 deletions mteb/descriptive_stats/AudioClassification/GLOBEV3Gender.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
{
"test": {
"num_samples": 5000,
"number_texts_intersect_with_train": null,
"text_statistics": null,
"image_statistics": null,
"audio_statistics": {
"total_duration_seconds": 25873.349833333417,
"min_duration_seconds": 0.56,
"average_duration_seconds": 5.174669966666683,
"max_duration_seconds": 21.984,
"unique_audios": 5000,
"average_sampling_rate": 24000.0,
"sampling_rates": {
"24000": 5000
}
},
"label_statistics": {
"min_labels_per_text": 1,
"average_label_per_text": 1.0,
"max_labels_per_text": 1,
"unique_labels": 2,
"labels": {
"1": {
"count": 3275
},
"0": {
"count": 1725
}
}
}
},
"train": {
"num_samples": 5000,
"number_texts_intersect_with_train": null,
"text_statistics": null,
"image_statistics": null,
"audio_statistics": {
"total_duration_seconds": 25205.078791666667,
"min_duration_seconds": 0.96,
"average_duration_seconds": 5.041015758333334,
"max_duration_seconds": 10.4,
"unique_audios": 5000,
"average_sampling_rate": 24000.0,
"sampling_rates": {
"24000": 5000
}
},
"label_statistics": {
"min_labels_per_text": 1,
"average_label_per_text": 1.0,
"max_labels_per_text": 1,
"unique_labels": 2,
"labels": {
"0": {
"count": 2500
},
"1": {
"count": 2500
}
}
}
}
}
8 changes: 8 additions & 0 deletions mteb/tasks/classification/eng/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
from mteb.tasks.classification.eng.globe_v2_gender_classification import (
GlobeV2GenderClassification,
)
from mteb.tasks.classification.eng.globe_v3_age_classification import (
GlobeV3AgeClassification,
)
from mteb.tasks.classification.eng.globe_v3_gender_classification import (
GlobeV3GenderClassification,
)
from mteb.tasks.classification.eng.iemocap_emotion import IEMOCAPEmotionClassification
from mteb.tasks.classification.eng.iemocap_gender import IEMOCAPGenderClassification
from mteb.tasks.classification.eng.libri_count import LibriCount
Expand Down Expand Up @@ -383,6 +389,8 @@
"GTSRBClassification",
"GlobeV2AgeClassification",
"GlobeV2GenderClassification",
"GlobeV3AgeClassification",
"GlobeV3GenderClassification",
"HUMEEmotionClassification",
"HUMEToxicConversationsClassification",
"HUMETweetSentimentExtractionClassification",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class GlobeV2AgeClassification(AbsTaskClassification):
},
type="AudioClassification",
category="a2t",
eval_splits=["train"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="accuracy",
date=("2025-01-13", "2025-01-13"),
Expand Down
39 changes: 39 additions & 0 deletions mteb/tasks/classification/eng/globe_v3_age_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from mteb.abstasks.classification import AbsTaskClassification
from mteb.abstasks.task_metadata import TaskMetadata


class GlobeV3AgeClassification(AbsTaskClassification):
    """Audio classification task: predict the speaker's age label on GLOBE v3.

    The class is purely declarative. It supplies the task metadata and names
    the dataset columns used for the audio input and the target label; all
    loading and evaluation logic comes from ``AbsTaskClassification``.
    """

    metadata = TaskMetadata(
        # --- Identity and data source ---
        name="GLOBEV3Age",
        description=(
            "Age classification from the GLOBE v3 dataset "
            "(sampled and enhanced from CommonVoice dataset for TTS purpose). "
            "This dataset is a stratified and downsampled version of the original dataset, "
            "containing about 535 hours of speech data across 164 accents. "
            "We use the age column as the target label for audio classification."
        ),
        reference="https://huggingface.co/datasets/MushanW/GLOBE_V3",
        dataset={
            "path": "mteb/globe-v3-age-mini",
            "revision": "f7399f4b836508a178c0913868e82462b4a8919b",
        },
        # --- Task shape and evaluation setup ---
        type="AudioClassification",
        category="a2t",
        modalities=["audio"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        # --- Provenance and licensing ---
        date=("2025-05-26", "2025-05-26"),
        domains=["Spoken", "Speech"],
        task_subtypes=["Age Classification"],
        license="cc0-1.0",
        annotations_creators="automatic",
        dialect=[],
        sample_creation="found",
        # The citation is runtime text: its lines are kept exactly as written.
        bibtex_citation=r"""
@misc{wang2024globe,
archiveprefix = {arXiv},
author = {Wenbin Wang and Yang Song and Sanjay Jha},
eprint = {2406.14875},
title = {GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech},
year = {2024},
}
""",
    )

    # Dataset column names for the model input and the classification target.
    input_column_name: str = "audio"
    label_column_name: str = "predicted_age"
39 changes: 39 additions & 0 deletions mteb/tasks/classification/eng/globe_v3_gender_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from mteb.abstasks.classification import AbsTaskClassification
from mteb.abstasks.task_metadata import TaskMetadata


class GlobeV3GenderClassification(AbsTaskClassification):
    """Audio classification task: predict the speaker's gender label on GLOBE v3.

    The class is purely declarative. It supplies the task metadata and names
    the dataset columns used for the audio input and the target label; all
    loading and evaluation logic comes from ``AbsTaskClassification``.
    """

    metadata = TaskMetadata(
        # --- Identity and data source ---
        name="GLOBEV3Gender",
        description=(
            "Gender classification from the GLOBE v3 dataset "
            "(sampled and enhanced from CommonVoice dataset for TTS purpose). "
            "This dataset is a stratified and downsampled version of the original dataset, "
            "containing about 535 hours of speech data across 164 accents. "
            "We use the gender column as the target label for audio classification."
        ),
        reference="https://huggingface.co/datasets/MushanW/GLOBE_V3",
        dataset={
            "path": "mteb/globe-v3-gender-mini",
            "revision": "7020a6c14ec8a8e967013e04f2a695ead308bee1",
        },
        # --- Task shape and evaluation setup ---
        type="AudioClassification",
        category="a2t",
        modalities=["audio"],
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="accuracy",
        # --- Provenance and licensing ---
        date=("2025-05-26", "2025-05-26"),
        domains=["Spoken", "Speech"],
        task_subtypes=["Gender Classification"],
        license="cc0-1.0",
        annotations_creators="automatic",
        dialect=[],
        sample_creation="found",
        # The citation is runtime text: its lines are kept exactly as written.
        bibtex_citation=r"""
@misc{wang2024globe,
archiveprefix = {arXiv},
author = {Wenbin Wang and Yang Song and Sanjay Jha},
eprint = {2406.14875},
title = {GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech},
year = {2024},
}
""",
    )

    # Dataset column names for the model input and the classification target.
    input_column_name: str = "audio"
    label_column_name: str = "predicted_gender"
Loading