diff --git a/mteb/descriptive_stats/AudioClassification/GLOBEV3Age.json b/mteb/descriptive_stats/AudioClassification/GLOBEV3Age.json new file mode 100644 index 0000000000..325762115f --- /dev/null +++ b/mteb/descriptive_stats/AudioClassification/GLOBEV3Age.json @@ -0,0 +1,106 @@ +{ + "test": { + "num_samples": 5000, + "number_texts_intersect_with_train": null, + "text_statistics": null, + "image_statistics": null, + "audio_statistics": { + "total_duration_seconds": 25920.202374999975, + "min_duration_seconds": 0.56, + "average_duration_seconds": 5.184040474999995, + "max_duration_seconds": 22.78, + "unique_audios": 5000, + "average_sampling_rate": 24000.0, + "sampling_rates": { + "24000": 5000 + } + }, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 9, + "labels": { + "twenties": { + "count": 1378 + }, + "fifties": { + "count": 638 + }, + "teens": { + "count": 606 + }, + "fourties": { + "count": 570 + }, + "thirties": { + "count": 1382 + }, + "sixties": { + "count": 263 + }, + "seventies": { + "count": 160 + }, + "nineties": { + "count": 2 + }, + "eighties": { + "count": 1 + } + } + } + }, + "train": { + "num_samples": 5000, + "number_texts_intersect_with_train": null, + "text_statistics": null, + "image_statistics": null, + "audio_statistics": { + "total_duration_seconds": 25967.226791666697, + "min_duration_seconds": 1.08, + "average_duration_seconds": 5.1934453583333395, + "max_duration_seconds": 10.539958333333333, + "unique_audios": 5000, + "average_sampling_rate": 24000.0, + "sampling_rates": { + "24000": 5000 + } + }, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 9, + "labels": { + "fourties": { + "count": 617 + }, + "thirties": { + "count": 617 + }, + "twenties": { + "count": 617 + }, + "teens": { + "count": 617 + }, + "fifties": { + "count": 618 + }, + "sixties": { + "count": 618 + }, + "seventies": { + "count": 618 + }, + "eighties": { + "count": 618 + }, + "nineties": { + "count": 60 + } + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/AudioClassification/GLOBEV3Gender.json b/mteb/descriptive_stats/AudioClassification/GLOBEV3Gender.json new file mode 100644 index 0000000000..d56b9e88f8 --- /dev/null +++ b/mteb/descriptive_stats/AudioClassification/GLOBEV3Gender.json @@ -0,0 +1,64 @@ +{ + "test": { + "num_samples": 5000, + "number_texts_intersect_with_train": null, + "text_statistics": null, + "image_statistics": null, + "audio_statistics": { + "total_duration_seconds": 25873.349833333417, + "min_duration_seconds": 0.56, + "average_duration_seconds": 5.174669966666683, + "max_duration_seconds": 21.984, + "unique_audios": 5000, + "average_sampling_rate": 24000.0, + "sampling_rates": { + "24000": 5000 + } + }, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 3275 + }, + "0": { + "count": 1725 + } + } + } + }, + "train": { + "num_samples": 5000, + "number_texts_intersect_with_train": null, + "text_statistics": null, + "image_statistics": null, + "audio_statistics": { + "total_duration_seconds": 25205.078791666667, + "min_duration_seconds": 0.96, + "average_duration_seconds": 5.041015758333334, + "max_duration_seconds": 10.4, + "unique_audios": 5000, + "average_sampling_rate": 24000.0, + "sampling_rates": { + "24000": 5000 + } + }, + "label_statistics": { + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 2500 + }, + "1": { + "count": 2500 + } + } + } + } +} \ No newline at end of file diff --git a/mteb/tasks/classification/eng/__init__.py b/mteb/tasks/classification/eng/__init__.py index 9c99ebc387..8d31917eac 100644 --- a/mteb/tasks/classification/eng/__init__.py +++ b/mteb/tasks/classification/eng/__init__.py @@ -23,6 +23,12 @@ from mteb.tasks.classification.eng.globe_v2_gender_classification import ( GlobeV2GenderClassification, ) +from mteb.tasks.classification.eng.globe_v3_age_classification import ( + GlobeV3AgeClassification, +) +from mteb.tasks.classification.eng.globe_v3_gender_classification import ( + GlobeV3GenderClassification, +) from mteb.tasks.classification.eng.iemocap_emotion import IEMOCAPEmotionClassification from mteb.tasks.classification.eng.iemocap_gender import IEMOCAPGenderClassification from mteb.tasks.classification.eng.libri_count import LibriCount @@ -383,6 +389,8 @@ "GTSRBClassification", "GlobeV2AgeClassification", "GlobeV2GenderClassification", + "GlobeV3AgeClassification", + "GlobeV3GenderClassification", "HUMEEmotionClassification", "HUMEToxicConversationsClassification", "HUMETweetSentimentExtractionClassification", diff --git a/mteb/tasks/classification/eng/globe_v2_age_classification.py b/mteb/tasks/classification/eng/globe_v2_age_classification.py index c8b09e6966..820fb4ab59 100644 --- a/mteb/tasks/classification/eng/globe_v2_age_classification.py +++ b/mteb/tasks/classification/eng/globe_v2_age_classification.py @@ -13,7 +13,7 @@ class GlobeV2AgeClassification(AbsTaskClassification): }, type="AudioClassification", category="a2t", - eval_splits=["train"], + eval_splits=["test"], eval_langs=["eng-Latn"], main_score="accuracy", date=("2025-01-13", "2025-01-13"), diff --git a/mteb/tasks/classification/eng/globe_v3_age_classification.py b/mteb/tasks/classification/eng/globe_v3_age_classification.py new file mode 100644 index 0000000000..3450b653f3 --- /dev/null +++ b/mteb/tasks/classification/eng/globe_v3_age_classification.py @@ -0,0 +1,39 @@ +from mteb.abstasks.classification import AbsTaskClassification +from mteb.abstasks.task_metadata import TaskMetadata + + +class GlobeV3AgeClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="GLOBEV3Age", + description="Age classification from the GLOBE v3 dataset (sampled and enhanced from CommonVoice dataset for TTS purpose). This dataset is a stratified and downsampled version of the original dataset, containing about 535 hours of speech data across 164 accents. We use the age column as the target label for audio classification.", + reference="https://huggingface.co/datasets/MushanW/GLOBE_V3", + dataset={ + "path": "mteb/globe-v3-age-mini", + "revision": "f7399f4b836508a178c0913868e82462b4a8919b", + }, + type="AudioClassification", + category="a2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2025-05-26", "2025-05-26"), + domains=["Spoken", "Speech"], + task_subtypes=["Age Classification"], + license="cc0-1.0", + annotations_creators="automatic", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation=r""" +@misc{wang2024globe, + archiveprefix = {arXiv}, + author = {Wenbin Wang and Yang Song and Sanjay Jha}, + eprint = {2406.14875}, + title = {GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech}, + year = {2024}, +} +""", + ) + + input_column_name: str = "audio" + label_column_name: str = "predicted_age" diff --git a/mteb/tasks/classification/eng/globe_v3_gender_classification.py b/mteb/tasks/classification/eng/globe_v3_gender_classification.py new file mode 100644 index 0000000000..bf8f0cda07 --- /dev/null +++ b/mteb/tasks/classification/eng/globe_v3_gender_classification.py @@ -0,0 +1,39 @@ +from mteb.abstasks.classification import AbsTaskClassification +from mteb.abstasks.task_metadata import TaskMetadata + + +class GlobeV3GenderClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="GLOBEV3Gender", + description="Gender classification from the GLOBE v3 dataset (sampled and enhanced from CommonVoice dataset for TTS purpose). This dataset is a stratified and downsampled version of the original dataset, containing about 535 hours of speech data across 164 accents. We use the gender column as the target label for audio classification.", + reference="https://huggingface.co/datasets/MushanW/GLOBE_V3", + dataset={ + "path": "mteb/globe-v3-gender-mini", + "revision": "7020a6c14ec8a8e967013e04f2a695ead308bee1", + }, + type="AudioClassification", + category="a2t", + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="accuracy", + date=("2025-05-26", "2025-05-26"), + domains=["Spoken", "Speech"], + task_subtypes=["Gender Classification"], + license="cc0-1.0", + annotations_creators="automatic", + dialect=[], + modalities=["audio"], + sample_creation="found", + bibtex_citation=r""" +@misc{wang2024globe, + archiveprefix = {arXiv}, + author = {Wenbin Wang and Yang Song and Sanjay Jha}, + eprint = {2406.14875}, + title = {GLOBE: A High-quality English Corpus with Global Accents for Zero-shot Speaker Adaptive Text-to-Speech}, + year = {2024}, +} +""", + ) + + input_column_name: str = "audio" + label_column_name: str = "predicted_gender"