From 2fb3e3e6b970b3b1c8118a2b938b3b595872751a Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Sun, 6 Apr 2025 14:50:13 +0530 Subject: [PATCH 1/8] Fix Metadata in tasks to solve filtering issue --- mteb/abstasks/TaskMetadata.py | 13 +++++++-- .../fas/FaMTEBClassification.py | 4 +-- .../pol/PolishClassification.py | 2 +- .../Clustering/deu/BlurbsClusteringP2P.py | 2 +- .../Clustering/deu/BlurbsClusteringS2S.py | 2 +- .../Clustering/deu/TenKGnadClusteringS2S.py | 6 ++--- .../Clustering/fra/AlloProfClusteringP2P.py | 12 ++++----- .../Clustering/fra/AlloProfClusteringS2S.py | 12 ++++----- mteb/tasks/Clustering/fra/HALClusteringS2S.py | 12 ++++----- .../multilingual/MasakhaNEWSClusteringS2S.py | 10 +++---- .../fas/FaMTEBPairClassification.py | 6 ++--- mteb/tasks/PairClassification/pol/PolishPC.py | 2 +- mteb/tasks/Reranking/zho/CMTEBReranking.py | 4 +-- mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py | 4 +-- .../tasks/Retrieval/deu/GermanDPRRetrieval.py | 6 ++--- .../Retrieval/deu/GermanQuADRetrieval.py | 6 ++--- mteb/tasks/Retrieval/eng/FEVERRetrieval.py | 8 +++--- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 27 ++++++++++++------- mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py | 6 ++--- mteb/tasks/STS/fas/FaMTEBSTS.py | 4 +-- mteb/tasks/STS/fra/SickFrSTS.py | 2 +- mteb/tasks/STS/zho/CMTEBSTS.py | 2 +- 22 files changed, 85 insertions(+), 67 deletions(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index b77887e4d1..a56b54fb91 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -56,6 +56,9 @@ "Tumor detection", "Duplicate Detection", "Rendered semantic textual similarity", + "Passage retrieval", + "Passage ranking", + "Legal information retrieval", ] TASK_DOMAIN = Literal[ @@ -66,6 +69,7 @@ "Engineering", "Fiction", "Government", + "General", "Legal", "Medical", "News", @@ -82,8 +86,13 @@ "Programming", "Chemistry", "Financial", - "Chemistry", - "Financial", + "Entertainment", + "E-commerce", + "Vehicle", + "IT", + "Emotion", + "Books", + "Not Specified", ] SAMPLE_CREATION_METHOD = Literal[ diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py index 43c7971429..92fb0588e7 100644 --- a/mteb/tasks/Classification/fas/FaMTEBClassification.py +++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py @@ -452,7 +452,7 @@ class PersianTextTone(AbsTaskClassification): eval_langs=["fas-Arab"], main_score="accuracy", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Not Specified"], task_subtypes=["Sentiment/Hate speech"], license="not specified", annotations_creators="LM-generated", @@ -536,7 +536,7 @@ class PersianTextEmotion(AbsTaskClassification): eval_langs=["fas-Arab"], main_score="accuracy", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Emotion"], task_subtypes=["Sentiment/Hate speech"], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/Classification/pol/PolishClassification.py b/mteb/tasks/Classification/pol/PolishClassification.py index c0963e8283..309292d094 100644 --- a/mteb/tasks/Classification/pol/PolishClassification.py +++ b/mteb/tasks/Classification/pol/PolishClassification.py @@ -123,7 +123,7 @@ class AllegroReviewsClassification(AbsTaskClassification): eval_langs=["pol-Latn"], main_score="accuracy", date=None, - domains=None, + domains=["E-commerce"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py index dbe155658e..d47629b5fd 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py @@ -26,7 +26,7 @@ class BlurbsClusteringP2P(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=None, + domains=["Books"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py index 1a54ed8242..fc226f3071 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py @@ -34,7 +34,7 @@ class BlurbsClusteringS2S(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=None, + domains=["Books"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py b/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py index f62c0f0aca..d4ac2f2581 100644 --- a/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/TenKGnadClusteringS2S.py @@ -24,8 +24,8 @@ class TenKGnadClusteringS2S(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=None, - task_subtypes=None, + domains=["News", "Non-fiction", "Written"], + task_subtypes=["Topic classification"], license=None, annotations_creators=None, dialect=None, @@ -57,7 +57,7 @@ class TenKGnadClusteringS2SFast(AbsTaskClusteringFast): "2020-12-31", ), # since it is news it is guessed that it is from 2000 to 2020 domains=["News", "Non-fiction", "Written"], - task_subtypes=None, + task_subtypes=["Topic classification"], license="cc-by-sa-4.0", annotations_creators="derived", dialect=[], diff --git a/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py b/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py index 0e95b82773..b1ef7f09b6 100644 --- a/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py +++ b/mteb/tasks/Clustering/fra/AlloProfClusteringP2P.py @@ -30,14 +30,14 @@ class AlloProfClusteringP2P(AbsTaskClustering): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="v_measure", - date=None, + date=("1996-01-01", "2023-04-14"), form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Thematic clustering"], + license="mit", + annotations_creators="human-annotated", dialect=None, - sample_creation=None, + sample_creation="found", bibtex_citation="""@misc{lef23, doi = {10.48550/ARXIV.2302.07738}, url = {https://arxiv.org/abs/2302.07738}, diff --git a/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py b/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py index 1b4f609827..fcd2e18455 100644 --- a/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py +++ b/mteb/tasks/Clustering/fra/AlloProfClusteringS2S.py @@ -30,14 +30,14 @@ class AlloProfClusteringS2S(AbsTaskClustering): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="v_measure", - date=None, + date=("1996-01-01", "2023-04-14"), form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Thematic clustering"], + license="mit", + annotations_creators="human-annotated", dialect=None, - sample_creation=None, + sample_creation="found", bibtex_citation="""@misc{lef23, doi = {10.48550/ARXIV.2302.07738}, url = {https://arxiv.org/abs/2302.07738}, diff --git a/mteb/tasks/Clustering/fra/HALClusteringS2S.py b/mteb/tasks/Clustering/fra/HALClusteringS2S.py index c6254befe6..cb4cc319a7 100644 --- a/mteb/tasks/Clustering/fra/HALClusteringS2S.py +++ b/mteb/tasks/Clustering/fra/HALClusteringS2S.py @@ -32,14 +32,14 @@ class HALClusteringS2S(AbsTaskClustering): eval_splits=["test"], eval_langs=["fra-Latn"], main_score="v_measure", - date=None, + date=("2000-03-29", "2024-05-24"), form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Academic", "Written"], + task_subtypes=["Thematic clustering"], + license="apache-2.0", + annotations_creators="human-annotated", dialect=None, - sample_creation=None, + sample_creation="found", bibtex_citation="""@misc{ciancone2024extending, title={Extending the Massive Text Embedding Benchmark to French}, author={Mathieu Ciancone and Imene Kerboua and Marion Schaeffer and Wissam Siblini}, diff --git a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py index 7e8b22b9af..21cedcfeba 100644 --- a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py +++ b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py @@ -45,11 +45,11 @@ class MasakhaNEWSClusteringS2S(AbsTaskClustering, MultilingualTask): eval_splits=["test"], eval_langs=_LANGUAGES, main_score="v_measure", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + date=("2023-04-21", "2023-05-26"), + domains=["News"], + task_subtypes=["Topic classification"], + license="afl-3.0", + annotations_creators="human-annotated", dialect=None, sample_creation=None, bibtex_citation="""@article{adelani2023masakhanews, diff --git a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py index 6deba76d8d..14efbbadba 100644 --- a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py +++ b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py @@ -99,7 +99,7 @@ class FarsiParaphraseDetection(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Not Specified"], task_subtypes=[], license="not specified", annotations_creators="derived", @@ -214,7 +214,7 @@ class ParsinluEntail(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Reviews"], task_subtypes=[], license="not specified", annotations_creators="derived", @@ -257,7 +257,7 @@ class ParsinluQueryParaphPC(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Reviews"], task_subtypes=[], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/PairClassification/pol/PolishPC.py b/mteb/tasks/PairClassification/pol/PolishPC.py index 099a953642..9e431b05ec 100644 --- a/mteb/tasks/PairClassification/pol/PolishPC.py +++ b/mteb/tasks/PairClassification/pol/PolishPC.py @@ -21,7 +21,7 @@ class SickePLPC(AbsTaskPairClassification): eval_langs=["pol-Latn"], main_score="max_ap", date=None, - domains=None, + domains=["Reviews"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index c701aa9227..3dcdc3a5da 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -21,9 +21,9 @@ class T2Reranking(AbsTaskReranking): main_score="map", date=None, form=None, - domains=None, + domains=["Not Specified"], task_subtypes=None, - license=None, + license="not specified", annotations_creators=None, dialect=None, sample_creation=None, diff --git a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py index 111eb986ed..b924661676 100644 --- a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py @@ -25,8 +25,8 @@ class GerDaLIR(AbsTaskRetrieval): eval_langs=["deu-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, + domains=["Legal"], + task_subtypes=["Legal information retrieval"], license=None, annotations_creators=None, dialect=None, diff --git a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py index a3118b8f73..e2ef49b1b7 100644 --- a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py @@ -26,9 +26,9 @@ class GermanDPR(AbsTaskRetrieval): eval_langs=["deu-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, + domains=["Encyclopaedic"], + task_subtypes=["Passage retrieval"], + license="cc-by-4.0", annotations_creators=None, dialect=None, sample_creation=None, diff --git a/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py b/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py index ba6a21e96e..a210d8f451 100644 --- a/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py @@ -43,9 +43,9 @@ class GermanQuADRetrieval(AbsTaskRetrieval): eval_langs=["deu-Latn"], main_score="mrr_at_5", date=None, - domains=None, - task_subtypes=None, - license=None, + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="cc-by-4.0", annotations_creators=None, dialect=None, sample_creation=None, diff --git a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py index 2a6130e804..a4513179eb 100644 --- a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py @@ -80,10 +80,10 @@ class FEVERHardNegatives(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", dialect=None, sample_creation=None, bibtex_citation="""@inproceedings{thorne-etal-2018-fever, diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index eec977f926..cbb83dbf71 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -44,11 +44,20 @@ class T2Retrieval(AbsTaskRetrieval): eval_splits=["dev"], eval_langs=["cmn-Hans"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, + date=("2023-04-04", "2023-05-16"), + domains=[ + "Medical", + "Academic", + "E-commerce", + "General", + "Government", + "Emotion", + "IT", + "Vehicle", + ], + task_subtypes=["Passage ranking"], + license="apache-2.0", + annotations_creators="human-annotated", dialect=None, sample_creation=None, bibtex_citation="""@misc{xie2023t2ranking, @@ -193,11 +202,11 @@ class CovidRetrieval(AbsTaskRetrieval): eval_splits=["dev"], eval_langs=["cmn-Hans"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, + date=("2022-03-03", "2022-03-18"), + domains=["Medical", "Entertainment", "E-commerce"], + task_subtypes=["Passage retrieval"], license=None, - annotations_creators=None, + annotations_creators="human-annotated", dialect=None, sample_creation=None, bibtex_citation=None, diff --git a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py index 34add4378e..68ad7062c6 100644 --- a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py @@ -21,10 +21,10 @@ class GermanSTSBenchmarkSTS(AbsTaskSTS): eval_splits=["validation", "test"], eval_langs=["deu-Latn"], main_score="cosine_spearman", - date=None, - domains=None, + date=("2023-11-09", "2024-01-24"), + domains=["Not Specified"], task_subtypes=None, - license=None, + license="cc-by-sa-3.0", annotations_creators=None, dialect=None, sample_creation=None, diff --git a/mteb/tasks/STS/fas/FaMTEBSTS.py b/mteb/tasks/STS/fas/FaMTEBSTS.py index 2ce9522cd4..1014243c7a 100644 --- a/mteb/tasks/STS/fas/FaMTEBSTS.py +++ b/mteb/tasks/STS/fas/FaMTEBSTS.py @@ -21,7 +21,7 @@ class Farsick(AbsTaskSTS): eval_langs=["fas-Arab"], main_score="cosine_spearman", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Not Specified"], task_subtypes=[], license="not specified", annotations_creators="derived", @@ -87,7 +87,7 @@ class Query2Query(AbsTaskSTS): eval_langs=["fas-Arab"], main_score="cosine_spearman", date=("2024-09-01", "2024-12-31"), - domains=[], + domains=["Not Specified"], task_subtypes=[], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/STS/fra/SickFrSTS.py b/mteb/tasks/STS/fra/SickFrSTS.py index 241aa60163..e91f0ad835 100644 --- a/mteb/tasks/STS/fra/SickFrSTS.py +++ b/mteb/tasks/STS/fra/SickFrSTS.py @@ -21,7 +21,7 @@ class SickFrSTS(AbsTaskSTS): eval_langs=["fra-Latn"], main_score="cosine_spearman", date=None, - domains=None, + domains=["Not Specified"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/STS/zho/CMTEBSTS.py b/mteb/tasks/STS/zho/CMTEBSTS.py index c7c0134d2a..7e030eeafe 100644 --- a/mteb/tasks/STS/zho/CMTEBSTS.py +++ b/mteb/tasks/STS/zho/CMTEBSTS.py @@ -196,7 +196,7 @@ class STSB(AbsTaskSTS): eval_langs=["cmn-Hans"], main_score="cosine_spearman", date=None, - domains=None, + domains=["Not Specified"], task_subtypes=None, license=None, annotations_creators=None, From 7444f9a046b95acca5f265df6170a5dda7c88df3 Mon Sep 17 00:00:00 2001 From: Munot Ayush Sunil Date: Sun, 6 Apr 2025 20:09:49 +0530 Subject: [PATCH 2/8] Update mteb/tasks/Retrieval/zho/CMTEBRetrieval.py Co-authored-by: Isaac Chung --- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index cbb83dbf71..e84b88e4f8 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -205,7 +205,7 @@ class CovidRetrieval(AbsTaskRetrieval): date=("2022-03-03", "2022-03-18"), domains=["Medical", "Entertainment", "E-commerce"], task_subtypes=["Passage retrieval"], - license=None, + license='not specified, annotations_creators="human-annotated", dialect=None, sample_creation=None, From 4d8f57e0e6a55ff7b88d5aeb395c2d10d8a52d5d Mon Sep 17 00:00:00 2001 From: Munot Ayush Sunil Date: Sun, 6 Apr 2025 20:09:57 +0530 Subject: [PATCH 3/8] Update mteb/tasks/Retrieval/zho/CMTEBRetrieval.py Co-authored-by: Isaac Chung --- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index e84b88e4f8..d8728359c6 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -207,7 +207,7 @@ class CovidRetrieval(AbsTaskRetrieval): task_subtypes=["Passage retrieval"], license='not specified, annotations_creators="human-annotated", - dialect=None, + dialect=[], sample_creation=None, bibtex_citation=None, prompt={ From e9a8ba73320e3910698278ed0602153f9d75e761 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Fri, 11 Apr 2025 17:01:03 +0530 Subject: [PATCH 4/8] Added citations and address comments --- mteb/abstasks/TaskMetadata.py | 2 +- mteb/leaderboard/app.py | 9 +++- .../multilingual/MasakhaNEWSClusteringS2S.py | 2 +- .../fas/FaMTEBPairClassification.py | 31 +++++++++-- mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py | 2 +- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 54 ++++++++++++++++--- mteb/tasks/STS/fra/SickFrSTS.py | 2 +- 7 files changed, 86 insertions(+), 16 deletions(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index a56b54fb91..e63f656e53 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -89,7 +89,7 @@ "Entertainment", "E-commerce", "Vehicle", - "IT", + "Information Technology", "Emotion", "Books", "Not Specified", diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index e3833b5ce3..9d5f51f291 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -579,8 +579,13 @@ def update_task_list( for task in mteb.get_benchmark(benchmark_name).tasks: if task.metadata.type not in type_select: continue - if not (set(task.metadata.domains or []) & set(domain_select)): - continue + # if not (set(task.metadata.domains or []) & set(domain_select)): + # continue + if task.metadata.domains is not None: + if not (set(task.metadata.domains) & set(domain_select)): + continue + else: + pass if not (set(task.languages or []) & set(lang_select)): continue if not (set(task.metadata.modalities or []) & set(modality_select)): diff --git a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py index 21cedcfeba..7af80b5cdd 100644 --- a/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py +++ b/mteb/tasks/Clustering/multilingual/MasakhaNEWSClusteringS2S.py @@ -46,7 +46,7 @@ class MasakhaNEWSClusteringS2S(AbsTaskClustering, MultilingualTask): eval_langs=_LANGUAGES, main_score="v_measure", date=("2023-04-21", "2023-05-26"), - domains=["News"], + domains=["News", "Written"], task_subtypes=["Topic classification"], license="afl-3.0", annotations_creators="human-annotated", diff --git a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py index 14efbbadba..830ec18014 100644 --- a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py +++ b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py @@ -26,7 +26,16 @@ class CExaPPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" """, + bibtex_citation="""@INPROCEEDINGS{9786243, + author={Sadeghi, Reyhaneh and Karbasi, Hamed and Akbari, Ahmad}, + booktitle={2022 8th International Conference on Web Research (ICWR)}, + title={ExaPPC: a Large-Scale Persian Paraphrase Detection Corpus}, + year={2022}, + volume={}, + number={}, + pages={168-175}, + keywords={Data mining;Task analysis;Paraphrase Identification;Semantic Similarity;Deep Learning;Paraphrasing Corpora}, + doi={10.1109/ICWR54782.2022.9786243}}""", ) def dataset_transform(self): @@ -220,7 +229,15 @@ class ParsinluEntail(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" """, + bibtex_citation="""@misc{khashabi2021parsinlusuitelanguageunderstanding, + title={ParsiNLU: A Suite of Language Understanding Challenges for Persian}, + author={Daniel Khashabi and Arman Cohan and Siamak Shakeri and Pedram Hosseini and Pouya Pezeshkpour and Malihe Alikhani and Moin Aminnaseri and Marzieh Bitaab and Faeze Brahman and Sarik Ghazarian and Mozhdeh Gheini and Arman Kabiri and Rabeeh Karimi Mahabadi and Omid Memarrast and Ahmadreza Mosallanezhad and Erfan Noury and Shahab Raji and Mohammad Sadegh Rasooli and Sepideh Sadeghi and Erfan Sadeqi Azer and Niloofar Safi Samghabadi and Mahsa Shafaei and Saber Sheybani and Ali Tazarv and Yadollah Yaghoobzadeh}, + year={2021}, + eprint={2012.06154}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2012.06154}, +}""", ) def dataset_transform(self): @@ -263,7 +280,15 @@ class ParsinluQueryParaphPC(AbsTaskPairClassification): annotations_creators="derived", dialect=[], sample_creation="found", - bibtex_citation=""" """, + bibtex_citation="""@misc{khashabi2021parsinlusuitelanguageunderstanding, + title={ParsiNLU: A Suite of Language Understanding Challenges for Persian}, + author={Daniel Khashabi and Arman Cohan and Siamak Shakeri and Pedram Hosseini and Pouya Pezeshkpour and Malihe Alikhani and Moin Aminnaseri and Marzieh Bitaab and Faeze Brahman and Sarik Ghazarian and Mozhdeh Gheini and Arman Kabiri and Rabeeh Karimi Mahabadi and Omid Memarrast and Ahmadreza Mosallanezhad and Erfan Noury and Shahab Raji and Mohammad Sadegh Rasooli and Sepideh Sadeghi and Erfan Sadeqi Azer and Niloofar Safi Samghabadi and Mahsa Shafaei and Saber Sheybani and Ali Tazarv and Yadollah Yaghoobzadeh}, + year={2021}, + eprint={2012.06154}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2012.06154}, +}""", ) def dataset_transform(self): diff --git a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py index b924661676..745a0fe60d 100644 --- a/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py @@ -26,7 +26,7 @@ class GerDaLIR(AbsTaskRetrieval): main_score="ndcg_at_10", date=None, domains=["Legal"], - task_subtypes=["Legal information retrieval"], + task_subtypes=[], license=None, annotations_creators=None, dialect=None, diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index d8728359c6..b17ac7ad44 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -52,7 +52,7 @@ class T2Retrieval(AbsTaskRetrieval): "General", "Government", "Emotion", - "IT", + "Information Technology", "Vehicle", ], task_subtypes=["Passage ranking"], @@ -205,11 +205,19 @@ class CovidRetrieval(AbsTaskRetrieval): date=("2022-03-03", "2022-03-18"), domains=["Medical", "Entertainment", "E-commerce"], task_subtypes=["Passage retrieval"], - license='not specified, + license="not specified", annotations_creators="human-annotated", dialect=[], sample_creation=None, - bibtex_citation=None, + bibtex_citation="""@misc{long2022multicprmultidomainchinese, + title={Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + author={Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + year={2022}, + eprint={2203.03367}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2203.03367}, +}""", prompt={ "query": "Given a question on COVID-19, retrieve news articles that answer the question" }, @@ -251,7 +259,15 @@ class CmedqaRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation="""@misc{qiu2022dureaderretrievallargescalechinesebenchmark, + title={DuReader_retrieval: A Large-scale Chinese Benchmark for Passage Retrieval from Web Search Engine}, + author={Yifu Qiu and Hongyu Li and Yingqi Qu and Ying Chen and Qiaoqiao She and Jing Liu and Hua Wu and Haifeng Wang}, + year={2022}, + eprint={2203.10232}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2203.10232}, +}""", prompt={ "query": "Given a Chinese community medical question, retrieve replies that best answer the question" }, @@ -295,7 +311,15 @@ class EcomRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation="""@misc{long2022multicprmultidomainchinese, + title={Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + author={Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + year={2022}, + eprint={2203.03367}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2203.03367}, +}""", prompt={ "query": "Given a user query from an e-commerce website, retrieve description sentences of relevant products" }, @@ -339,7 +363,15 @@ class MedicalRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation="""@misc{long2022multicprmultidomainchinese, + title={Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + author={Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + year={2022}, + eprint={2203.03367}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2203.03367}, +}""", prompt={ "query": "Given a medical question, retrieve user replies that best answer the question" }, @@ -383,7 +415,15 @@ class VideoRetrieval(AbsTaskRetrieval): annotations_creators=None, dialect=None, sample_creation=None, - bibtex_citation=None, + bibtex_citation="""@misc{long2022multicprmultidomainchinese, + title={Multi-CPR: A Multi Domain Chinese Dataset for Passage Retrieval}, + author={Dingkun Long and Qiong Gao and Kuan Zou and Guangwei Xu and Pengjun Xie and Ruijie Guo and Jian Xu and Guanjun Jiang and Luxi Xing and Ping Yang}, + year={2022}, + eprint={2203.03367}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2203.03367}, +}""", prompt={ "query": "Given a video search query, retrieve the titles of relevant videos" }, diff --git a/mteb/tasks/STS/fra/SickFrSTS.py b/mteb/tasks/STS/fra/SickFrSTS.py index e91f0ad835..c34a933121 100644 --- a/mteb/tasks/STS/fra/SickFrSTS.py +++ b/mteb/tasks/STS/fra/SickFrSTS.py @@ -21,7 +21,7 @@ class SickFrSTS(AbsTaskSTS): eval_langs=["fra-Latn"], main_score="cosine_spearman", date=None, - domains=["Not Specified"], + domains=[], task_subtypes=None, license=None, annotations_creators=None, From 0dfcfbe728f9c5d913c56894e94f2ecdcc7c553c Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Wed, 16 Apr 2025 20:35:38 +0530 Subject: [PATCH 5/8] address comments --- mteb/abstasks/TaskMetadata.py | 9 --------- mteb/tasks/Classification/fas/FaMTEBClassification.py | 2 +- mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py | 2 +- .../PairClassification/fas/FaMTEBPairClassification.py | 6 +++--- mteb/tasks/Reranking/zho/CMTEBReranking.py | 2 +- mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py | 2 +- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 9 +++------ mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py | 2 +- mteb/tasks/STS/fas/FaMTEBSTS.py | 4 ++-- mteb/tasks/STS/zho/CMTEBSTS.py | 2 +- 10 files changed, 14 insertions(+), 26 deletions(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index e63f656e53..45519cc390 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -56,9 +56,6 @@ "Tumor detection", "Duplicate Detection", "Rendered semantic textual similarity", - "Passage retrieval", - "Passage ranking", - "Legal information retrieval", ] TASK_DOMAIN = Literal[ @@ -69,7 +66,6 @@ "Engineering", "Fiction", "Government", - "General", "Legal", "Medical", "News", @@ -87,11 +83,6 @@ "Chemistry", "Financial", "Entertainment", - "E-commerce", - "Vehicle", - "Information Technology", - "Emotion", - "Books", "Not Specified", ] diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py index 92fb0588e7..6a7836d45b 100644 --- a/mteb/tasks/Classification/fas/FaMTEBClassification.py +++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py @@ -452,7 +452,7 @@ class PersianTextTone(AbsTaskClassification): eval_langs=["fas-Arab"], main_score="accuracy", date=("2024-09-01", "2024-12-31"), - domains=["Not Specified"], + domains=[], task_subtypes=["Sentiment/Hate speech"], license="not specified", annotations_creators="LM-generated", diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py index fc226f3071..7847ecd768 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringS2S.py @@ -34,7 +34,7 @@ class BlurbsClusteringS2S(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=["Books"], + domains=["Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py index 830ec18014..98deac52e9 100644 --- a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py +++ b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py @@ -108,7 +108,7 @@ class FarsiParaphraseDetection(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=["Not Specified"], + domains=[], task_subtypes=[], license="not specified", annotations_creators="derived", @@ -223,7 +223,7 @@ class ParsinluEntail(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=["Reviews"], + domains=["Reviews", "Written"], task_subtypes=[], license="not specified", annotations_creators="derived", @@ -274,7 +274,7 @@ class ParsinluQueryParaphPC(AbsTaskPairClassification): eval_langs=["fas-Arab"], main_score="max_ap", date=("2024-09-01", "2024-12-31"), - domains=["Reviews"], + domains=["Reviews", "Written"], task_subtypes=[], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index 3dcdc3a5da..ea74d1fd34 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -21,7 +21,7 @@ class T2Reranking(AbsTaskReranking): main_score="map", date=None, form=None, - domains=["Not Specified"], + domains=[], task_subtypes=None, license="not specified", annotations_creators=None, diff --git a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py index e2ef49b1b7..af9b80a880 100644 --- a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py @@ -27,7 +27,7 @@ class GermanDPR(AbsTaskRetrieval): main_score="ndcg_at_10", date=None, domains=["Encyclopaedic"], - task_subtypes=["Passage retrieval"], + task_subtypes=[], license="cc-by-4.0", annotations_creators=None, dialect=None, diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index b17ac7ad44..4fb9358a14 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -48,14 +48,11 @@ class T2Retrieval(AbsTaskRetrieval): domains=[ "Medical", "Academic", - "E-commerce", - "General", + "Financial", "Government", - "Emotion", - "Information Technology", - "Vehicle", + "Non-fiction", ], - task_subtypes=["Passage ranking"], + task_subtypes=[], license="apache-2.0", annotations_creators="human-annotated", dialect=None, diff --git a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py index 68ad7062c6..a0552eb4df 100644 --- a/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py +++ b/mteb/tasks/STS/deu/GermanSTSBenchmarkSTS.py @@ -22,7 +22,7 @@ class GermanSTSBenchmarkSTS(AbsTaskSTS): eval_langs=["deu-Latn"], main_score="cosine_spearman", date=("2023-11-09", "2024-01-24"), - domains=["Not Specified"], + domains=[], task_subtypes=None, license="cc-by-sa-3.0", annotations_creators=None, diff --git a/mteb/tasks/STS/fas/FaMTEBSTS.py b/mteb/tasks/STS/fas/FaMTEBSTS.py index 1014243c7a..2ce9522cd4 100644 --- a/mteb/tasks/STS/fas/FaMTEBSTS.py +++ b/mteb/tasks/STS/fas/FaMTEBSTS.py @@ -21,7 +21,7 @@ class Farsick(AbsTaskSTS): eval_langs=["fas-Arab"], main_score="cosine_spearman", date=("2024-09-01", "2024-12-31"), - domains=["Not Specified"], + domains=[], task_subtypes=[], license="not specified", annotations_creators="derived", @@ -87,7 +87,7 @@ class Query2Query(AbsTaskSTS): eval_langs=["fas-Arab"], main_score="cosine_spearman", date=("2024-09-01", "2024-12-31"), - domains=["Not Specified"], + domains=[], task_subtypes=[], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/STS/zho/CMTEBSTS.py b/mteb/tasks/STS/zho/CMTEBSTS.py index 7e030eeafe..bcc149f937 100644 --- a/mteb/tasks/STS/zho/CMTEBSTS.py +++ b/mteb/tasks/STS/zho/CMTEBSTS.py @@ -196,7 +196,7 @@ class STSB(AbsTaskSTS): eval_langs=["cmn-Hans"], main_score="cosine_spearman", date=None, - domains=["Not Specified"], + domains=[], task_subtypes=None, license=None, annotations_creators=None, From 585e9de3d80b61d4fa38d8ebb1179cce346f1b7a Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Wed, 16 Apr 2025 21:18:12 +0530 Subject: [PATCH 6/8] correct subtypes --- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index 4fb9358a14..ed33c625da 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -201,7 +201,7 @@ class CovidRetrieval(AbsTaskRetrieval): main_score="ndcg_at_10", date=("2022-03-03", "2022-03-18"), domains=["Medical", "Entertainment", "E-commerce"], - task_subtypes=["Passage retrieval"], + task_subtypes=[], license="not specified", annotations_creators="human-annotated", dialect=[], From caff1412ec6f1b0dc7ed4c68dcf83703115a4967 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Wed, 16 Apr 2025 21:30:43 +0530 Subject: [PATCH 7/8] Fix incorrect domains --- mteb/tasks/Classification/fas/FaMTEBClassification.py | 2 +- mteb/tasks/Classification/pol/PolishClassification.py | 2 +- mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py | 2 +- mteb/tasks/Retrieval/zho/CMTEBRetrieval.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py index 6a7836d45b..43c7971429 100644 --- a/mteb/tasks/Classification/fas/FaMTEBClassification.py +++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py @@ -536,7 +536,7 @@ class PersianTextEmotion(AbsTaskClassification): eval_langs=["fas-Arab"], main_score="accuracy", date=("2024-09-01", "2024-12-31"), - domains=["Emotion"], + domains=[], task_subtypes=["Sentiment/Hate speech"], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/Classification/pol/PolishClassification.py b/mteb/tasks/Classification/pol/PolishClassification.py index 309292d094..7b1148f90b 100644 --- a/mteb/tasks/Classification/pol/PolishClassification.py +++ b/mteb/tasks/Classification/pol/PolishClassification.py @@ -123,7 +123,7 @@ class AllegroReviewsClassification(AbsTaskClassification): eval_langs=["pol-Latn"], main_score="accuracy", date=None, - domains=["E-commerce"], + domains=["Reviews"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py index d47629b5fd..e8407b2429 100644 --- a/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py +++ b/mteb/tasks/Clustering/deu/BlurbsClusteringP2P.py @@ -26,7 +26,7 @@ class BlurbsClusteringP2P(AbsTaskClustering): main_score="v_measure", date=None, form=None, - domains=["Books"], + domains=["Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index ed33c625da..643a414ada 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -200,7 +200,7 @@ class CovidRetrieval(AbsTaskRetrieval): eval_langs=["cmn-Hans"], main_score="ndcg_at_10", date=("2022-03-03", "2022-03-18"), - domains=["Medical", "Entertainment", "E-commerce"], + domains=["Medical", "Entertainment"], task_subtypes=[], license="not specified", annotations_creators="human-annotated", From 59930e98ed52d1d726382301896485fd0ea98fa7 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Thu, 24 Apr 2025 20:12:42 +0800 Subject: [PATCH 8/8] Update TaskMetadata.py --- mteb/abstasks/TaskMetadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 468bb8a584..e06cbd4568 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -80,7 +80,6 @@ "Chemistry", "Financial", "Entertainment", - "Not Specified", ] SAMPLE_CREATION_METHOD = Literal[