diff --git a/mteb/tasks/Audio/Any2AnyRetrieval/CMU_Arctic.py b/mteb/tasks/Audio/Any2AnyRetrieval/CMU_Arctic.py
new file mode 100644
index 0000000000..803fadac1b
--- /dev/null
+++ b/mteb/tasks/Audio/Any2AnyRetrieval/CMU_Arctic.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from mteb.abstasks.Image.AbsTaskAny2AnyRetrieval import AbsTaskAny2AnyRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CMUArcticA2TRetrieval(AbsTaskAny2AnyRetrieval):
+    """Audio-to-text retrieval on CMU Arctic audio ↔ transcription pairs."""
+
+    metadata = TaskMetadata(
+        name="CMUArcticA2TRetrieval",
+        description=(
+            "Retrieve the correct transcription for an English speech segment. "
+            "The dataset is derived from the phonetically balanced CMU Arctic single-speaker TTS corpora. "
+            "The corpora contain 1150 samples based on read-aloud segments from books, which are out of copyright "
+            "and derived from the Gutenberg project."
+        ),
+        reference="http://festvox.org/cmu_arctic/",
+        dataset={
+            "path": "mteb/CMU_Arctic_a2t",
+            "revision": "68e5228b82d03c20c22322ad22008464a32f960b",
+        },
+        type="Any2AnyRetrieval",
+        category="a2t",
+        modalities=["text", "audio"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="cv_recall_at_5",
+        date=("2000-01-01", "2002-12-31"),
+        domains=["Spoken"],
+        task_subtypes=["Speech Transcription Retrieval"],
+        license="cc0-1.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@techreport{cmu-lti-03-177,
+  author = {Kominek, John and Black, Alan W.},
+  institution = {Carnegie Mellon University, Language Technologies Institute},
+  number = {CMU-LTI-03-177},
+  title = {CMU ARCTIC databases for speech synthesis},
+  year = {2003},
+}
+""",
+    )
+
+
+class CMUArcticT2ARetrieval(AbsTaskAny2AnyRetrieval):
+    """Text-to-audio retrieval on CMU Arctic transcription ↔ audio pairs."""
+
+    metadata = TaskMetadata(
+        name="CMUArcticT2ARetrieval",
+        description=(
+            "Retrieve the correct audio segment for an English transcription. "
+            "The dataset is derived from the phonetically balanced CMU Arctic single-speaker TTS corpora. "
+            "The corpora contain 1150 audio-text pairs based on read-aloud segments from public domain books "
+            "originally sourced from the Gutenberg project."
+        ),
+        reference="http://festvox.org/cmu_arctic/",
+        dataset={
+            "path": "mteb/CMU_Arctic_t2a",
+            "revision": "7c845fdfe355c226096203ffd4cdead3229950dc",
+        },
+        type="Any2AnyRetrieval",
+        category="t2a",
+        modalities=["text", "audio"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="cv_recall_at_5",
+        date=("2000-01-01", "2002-12-31"),
+        domains=["Spoken"],
+        task_subtypes=["Speech Transcription Retrieval"],
+        license="cc0-1.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@techreport{cmu-lti-03-177,
+  author = {Kominek, John and Black, Alan W.},
+  institution = {Carnegie Mellon University, Language Technologies Institute},
+  number = {CMU-LTI-03-177},
+  title = {CMU ARCTIC databases for speech synthesis},
+  year = {2003},
+}
+""",
+    )
diff --git a/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py b/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py
index 26a990cd16..c6a4ca7ebb 100644
--- a/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py
+++ b/mteb/tasks/Audio/Any2AnyRetrieval/__init__.py
@@ -2,6 +2,7 @@
 
 from .AudioCaps import *
 from .Clotho import *
+from .CMU_Arctic import *
 from .EmoVDB import *
 from .HiFiTTS import *
 from .MACS import *