From d9a676e4cf3620cf8e073d2425e9f516d28de9a7 Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Wed, 7 Jan 2026 16:23:51 +0900 Subject: [PATCH 1/5] add dataset: KoViDoRe v2 --- mteb/benchmarks/benchmarks/__init__.py | 2 + mteb/benchmarks/benchmarks/benchmarks.py | 25 ++ .../KoVidore2CybersecurityRetrieval.json | 32 +++ .../KoVidore2EconomicRetrieval.json | 32 +++ .../KoVidore2EnergyRetrieval.json | 32 +++ .../KoVidore2HrRetrieval.json | 32 +++ mteb/tasks/retrieval/kor/__init__.py | 16 +- .../kor/kovidore2_bench_retrieval.py | 238 ++++++++++++++++++ 8 files changed, 408 insertions(+), 1 deletion(-) create mode 100644 mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json create mode 100644 mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json create mode 100644 mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json create mode 100644 mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json create mode 100644 mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index 3988cd972a..862ecaa66d 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -14,6 +14,7 @@ JINA_VDR, JMTEB_LITE_V1, JMTEB_V2, + KOVIDORE_V2, LONG_EMBED, MIEB_ENG, MIEB_IMG, @@ -79,6 +80,7 @@ "JINA_VDR", "JMTEB_LITE_V1", "JMTEB_V2", + "KOVIDORE_V2", "LONG_EMBED", "MIEB_ENG", "MIEB_IMG", diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py index 16a84b3893..bec3fe2e4c 100644 --- a/mteb/benchmarks/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks/benchmarks.py @@ -2728,3 +2728,28 @@ """, contacts=["lsz05"], ) + +KOVIDORE_V2 = Benchmark( + name="KoViDoRe(v2)", + display_name="KoViDoRe v2", + language_view=["kor-Hang"], + tasks=get_tasks( + tasks=[ + "KoVidore2CybersecurityRetrieval", + "KoVidore2EconomicRetrieval", + "KoVidore2EnergyRetrieval", + "KoVidore2HrRetrieval", + ] + ), + description="KoViDoRe v2 sets a new industry gold standard for multi-modal, enterprise document visual retrieval evaluation. It addresses a critical challenge in production RAG systems: retrieving accurate information from complex, visually-rich documents.", + reference="https://github.com/whybe-choi/kovidore-data-generator", + citation=r""" +@misc{choi2026kovidorev2, + author = {Yongbin Choi}, + title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, + year = {2026}, + url = {https://github.com/whybe-choi/kovidore-data-generator}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} +} +""", +) diff --git a/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json new file mode 100644 index 0000000000..c175631c22 --- /dev/null +++ b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2CybersecurityRetrieval.json @@ -0,0 +1,32 @@ +{ + "test": { + "num_samples": 1299, + "number_of_characters": 9254, + "documents_text_statistics": null, + "documents_image_statistics": { + "min_image_width": 2245, + "average_image_width": 2370.324347826087, + "max_image_width": 3508, + "min_image_height": 2481, + "average_image_height": 3289.8060869565215, + "max_image_height": 3580, + "unique_images": 1132 + }, + "queries_text_statistics": { + "total_text_length": 9254, + "min_text_length": 15, + "average_text_length": 62.10738255033557, + "max_text_length": 108, + "unique_texts": 149 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 409, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.7449664429530203, + "max_relevant_docs_per_query": 7, + "unique_relevant_docs": 316 + }, + "top_ranked_statistics": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json new file mode 100644 index 0000000000..1e64ebda1e --- /dev/null +++ b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EconomicRetrieval.json @@ -0,0 +1,32 @@ +{ + "test": { + "num_samples": 1640, + "number_of_characters": 8331, + "documents_text_statistics": null, + "documents_image_statistics": { + "min_image_width": 2313, + "average_image_width": 2347.5321597833445, + "max_image_width": 2481, + "min_image_height": 3138, + "average_image_height": 3214.301963439404, + "max_image_height": 3508, + "unique_images": 1442 + }, + "queries_text_statistics": { + "total_text_length": 8331, + "min_text_length": 23, + "average_text_length": 51.11042944785276, + "max_text_length": 110, + "unique_texts": 163 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 413, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.5337423312883436, + "max_relevant_docs_per_query": 6, + "unique_relevant_docs": 349 + }, + "top_ranked_statistics": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json new file mode 100644 index 0000000000..3e35d9ce02 --- /dev/null +++ b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2EnergyRetrieval.json @@ -0,0 +1,32 @@ +{ + "test": { + "num_samples": 2101, + "number_of_characters": 10417, + "documents_text_statistics": null, + "documents_image_statistics": { + "min_image_width": 2221, + "average_image_width": 2305.0387231815803, + "max_image_width": 2480, + "min_image_height": 3036, + "average_image_height": 3186.1962323390894, + "max_image_height": 3508, + "unique_images": 1900 + }, + "queries_text_statistics": { + "total_text_length": 10417, + "min_text_length": 22, + "average_text_length": 54.82631578947368, + "max_text_length": 103, + "unique_texts": 189 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 571, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 3.0052631578947366, + "max_relevant_docs_per_query": 7, + "unique_relevant_docs": 464 + }, + "top_ranked_statistics": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json new file mode 100644 index 0000000000..c837283bfb --- /dev/null +++ b/mteb/descriptive_stats/Image/DocumentUnderstanding/KoVidore2HrRetrieval.json @@ -0,0 +1,32 @@ +{ + "test": { + "num_samples": 2330, + "number_of_characters": 13131, + "documents_text_statistics": null, + "documents_image_statistics": { + "min_image_width": 1949, + "average_image_width": 2430.1152204836417, + "max_image_width": 3505, + "min_image_height": 2480, + "average_image_height": 3350.3921289710765, + "max_image_height": 3626, + "unique_images": 2096 + }, + "queries_text_statistics": { + "total_text_length": 13131, + "min_text_length": 21, + "average_text_length": 59.41628959276018, + "max_text_length": 112, + "unique_texts": 221 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 726, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 3.2850678733031673, + "max_relevant_docs_per_query": 7, + "unique_relevant_docs": 575 + }, + "top_ranked_statistics": null + } +} \ No newline at end of file diff --git a/mteb/tasks/retrieval/kor/__init__.py b/mteb/tasks/retrieval/kor/__init__.py index 9f680addc0..ae6254e752 100644 --- a/mteb/tasks/retrieval/kor/__init__.py +++ b/mteb/tasks/retrieval/kor/__init__.py @@ -1,5 +1,19 @@ from .auto_rag_retrieval import AutoRAGRetrieval from .ko_strategy_qa import KoStrategyQA +from .kovidore2_bench_retrieval import ( + KoVidore2CybersecurityRetrieval, + KoVidore2EconomicRetrieval, + KoVidore2EnergyRetrieval, + KoVidore2HrRetrieval, +) from .squad_kor_v1_retrieval import SQuADKorV1Retrieval -__all__ = ["AutoRAGRetrieval", "KoStrategyQA", "SQuADKorV1Retrieval"] +__all__ = [ + "AutoRAGRetrieval", + "KoStrategyQA", + "KoVidore2CybersecurityRetrieval", + "KoVidore2EconomicRetrieval", + "KoVidore2EnergyRetrieval", + "KoVidore2HrRetrieval", + "SQuADKorV1Retrieval", +] diff --git a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py new file mode 100644 index 0000000000..5bd662101d --- /dev/null +++ b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py @@ -0,0 +1,238 @@ +from datasets import load_dataset + +from mteb.abstasks.retrieval import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + + +def _load_data( + path: str, + splits: str, + revision: str | None = None, +): + corpus = {} + queries = {} + relevant_docs = {} + + for split in splits: + query_ds = load_dataset( + path, + "queries", + split=split, + revision=revision, + ) + query_ds = query_ds.map( + lambda x: { + "id": f"query-{split}-{x['query_id']}", + "text": x["query"], + "modality": "text", + }, + remove_columns=["query_id", "query"], + ) + queries[split] = query_ds + + corpus_ds = load_dataset( + path, + "corpus", + split=split, + revision=revision, + ) + corpus_ds = corpus_ds.map( + lambda x: { + "id": f"corpus-{split}-{x['corpus_id']}", + "modality": "image", + }, + remove_columns=["corpus_id"], + ) + corpus[split] = corpus_ds + + qrels_ds = load_dataset( + path, + "qrels", + split=split, + revision=revision, + ) + relevant_docs[split] = {} + for row in qrels_ds: + qid = f"query-{split}-{row['query_id']}" + did = f"corpus-{split}-{row['corpus_id']}" + if qid not in relevant_docs[split]: + relevant_docs[split][qid] = {} + relevant_docs[split][qid][did] = int(row["score"]) + + return corpus, queries, relevant_docs + + +class KoVidore2CybersecurityRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="KoVidore2CybersecurityRetrieval", + description="Retrieve associated pages according to questions. This dataset, Cybersecurity, is a corpus of technical reports on cyber threat trends and security incident responses in Korea, intended for complex-document understanding tasks.", + reference="https://github.com/whybe-choi/kovidore-data-generator", + dataset={ + "path": "whybe-choi/kovidore-v2-cybersecurity-beir", + "revision": "006dcb0e8f63c9736687cb36e725769c903054b0", + }, + type="DocumentUnderstanding", + category="t2i", + eval_splits=["test"], + eval_langs=["kor-Hang"], + main_score="ndcg_at_10", + date=("2025-12-21", "2026-01-06"), + domains=["Social"], + task_subtypes=["Image Text Retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + modalities=["text", "image"], + sample_creation="created", + bibtex_citation=""" +@misc{choi2026kovidorev2, + author = {Yongbin Choi}, + title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, + year = {2026}, + url = {https://github.com/whybe-choi/kovidore-data-generator}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} +} +""", + prompt={"query": "Find a screenshot that is relevant to the user's question."}, + ) + + def load_data(self) -> None: + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + revision=self.metadata.dataset["revision"], + ) + + self.data_loaded = True + + +class KoVidore2EconomicRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="KoVidore2EconomicRetrieval", + description="Retrieve associated pages according to questions. This dataset, Economic trends, is a corpus of periodic reports on major economic indicators in Korea, intended for complex-document understanding tasks.", + reference="https://github.com/whybe-choi/kovidore-data-generator", + dataset={ + "path": "whybe-choi/kovidore-v2-economic-beir", + "revision": "8400656ad1e90e7662d7cda44628eaa2d29ea8d8", + }, + type="DocumentUnderstanding", + category="t2i", + eval_splits=["test"], + eval_langs=["kor-Hang"], + main_score="ndcg_at_10", + date=("2025-12-21", "2026-01-06"), + domains=["Social"], + task_subtypes=["Image Text Retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + modalities=["text", "image"], + sample_creation="created", + bibtex_citation=""" +@misc{choi2026kovidorev2, + author = {Yongbin Choi}, + title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, + year = {2026}, + url = {https://github.com/whybe-choi/kovidore-data-generator}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} +} +""", + prompt={"query": "Find a screenshot that is relevant to the user's question."}, + ) + + def load_data(self) -> None: + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + revision=self.metadata.dataset["revision"], + ) + + self.data_loaded = True + + +class KoVidore2EnergyRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="KoVidore2EnergyRetrieval", + description="Retrieve associated pages according to questions. This dataset, Energy, is a corpus of reports on energy market trends, policy planning, and industry statistics, intended for complex-document understanding tasks.", + reference="https://github.com/whybe-choi/kovidore-data-generator", + dataset={ + "path": "whybe-choi/kovidore-v2-energy-beir", + "revision": "17fea125be86500c0d7891967ca0e4ada14fbe0d", + }, + type="DocumentUnderstanding", + category="t2i", + eval_splits=["test"], + eval_langs=["kor-Hang"], + main_score="ndcg_at_10", + date=("2025-12-21", "2026-01-06"), + domains=["Social"], + task_subtypes=["Image Text Retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + modalities=["text", "image"], + sample_creation="created", + bibtex_citation=""" +@misc{choi2026kovidorev2, + author = {Yongbin Choi}, + title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, + year = {2026}, + url = {https://github.com/whybe-choi/kovidore-data-generator}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} +} +""", + prompt={"query": "Find a screenshot that is relevant to the user's question."}, + ) + + def load_data(self) -> None: + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + revision=self.metadata.dataset["revision"], + ) + + self.data_loaded = True + + +class KoVidore2HrRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="KoVidore2HrRetrieval", + description="Retrieve associated pages according to questions. This dataset, HR, is a corpus of reports on workforce outlook and employment policy in korea, intended for complex-document understanding tasks.", + reference="https://github.com/whybe-choi/kovidore-data-generator", + dataset={ + "path": "whybe-choi/kovidore-v2-hr-beir", + "revision": "0641db2d66968538823af3a847257ee6b813c57e", + }, + type="DocumentUnderstanding", + category="t2i", + eval_splits=["test"], + eval_langs=["kor-Hang"], + main_score="ndcg_at_10", + date=("2025-12-21", "2026-01-06"), + domains=["Social"], + task_subtypes=["Image Text Retrieval"], + license="cc-by-4.0", + annotations_creators="derived", + dialect=[], + modalities=["text", "image"], + sample_creation="created", + bibtex_citation=""" +@misc{choi2026kovidorev2, + author = {Yongbin Choi}, + title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, + year = {2026}, + url = {https://github.com/whybe-choi/kovidore-data-generator}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} +} +""", + prompt={"query": "Find a screenshot that is relevant to the user's question."}, + ) + + def load_data(self) -> None: + self.corpus, self.queries, self.relevant_docs = _load_data( + path=self.metadata.dataset["path"], + splits=self.metadata.eval_splits, + revision=self.metadata.dataset["revision"], + ) + + self.data_loaded = True From b9b62ec32b3679d17995884624f1ad581f6722ac Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Wed, 7 Jan 2026 16:46:57 +0900 Subject: [PATCH 2/5] fix citation format --- mteb/benchmarks/benchmarks/benchmarks.py | 4 ++-- .../retrieval/kor/kovidore2_bench_retrieval.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py index bec3fe2e4c..eb6b8436ea 100644 --- a/mteb/benchmarks/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks/benchmarks.py @@ -2746,10 +2746,10 @@ citation=r""" @misc{choi2026kovidorev2, author = {Yongbin Choi}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains}, title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, - year = {2026}, url = {https://github.com/whybe-choi/kovidore-data-generator}, - note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} + year = {2026}, } """, ) diff --git a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py index 5bd662101d..ad8e67f6c8 100644 --- a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +++ b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py @@ -87,10 +87,10 @@ class KoVidore2CybersecurityRetrieval(AbsTaskRetrieval): bibtex_citation=""" @misc{choi2026kovidorev2, author = {Yongbin Choi}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains}, title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, - year = {2026}, url = {https://github.com/whybe-choi/kovidore-data-generator}, - note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} + year = {2026}, } """, prompt={"query": "Find a screenshot that is relevant to the user's question."}, @@ -131,10 +131,10 @@ class KoVidore2EconomicRetrieval(AbsTaskRetrieval): bibtex_citation=""" @misc{choi2026kovidorev2, author = {Yongbin Choi}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains}, title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, - year = {2026}, url = {https://github.com/whybe-choi/kovidore-data-generator}, - note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} + year = {2026}, } """, prompt={"query": "Find a screenshot that is relevant to the user's question."}, @@ -175,10 +175,10 @@ class KoVidore2EnergyRetrieval(AbsTaskRetrieval): bibtex_citation=""" @misc{choi2026kovidorev2, author = {Yongbin Choi}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains}, title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, - year = {2026}, url = {https://github.com/whybe-choi/kovidore-data-generator}, - note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} + year = {2026}, } """, prompt={"query": "Find a screenshot that is relevant to the user's question."}, @@ -219,10 +219,10 @@ class KoVidore2HrRetrieval(AbsTaskRetrieval): bibtex_citation=""" @misc{choi2026kovidorev2, author = {Yongbin Choi}, + note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains}, title = {KoViDoRe v2: a comprehensive evaluation of vision document retrieval for enterprise use-cases}, - year = {2026}, url = {https://github.com/whybe-choi/kovidore-data-generator}, - note = {A benchmark for evaluating Korean vision document retrieval with multi-page reasoning queries in practical domains} + year = {2026}, } """, prompt={"query": "Find a screenshot that is relevant to the user's question."}, From 7271c435faf72d2f21322cb8e0474839bf706f08 Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Thu, 8 Jan 2026 13:54:15 +0900 Subject: [PATCH 3/5] add direct loading --- .../kor/kovidore2_bench_retrieval.py | 111 ++---------------- 1 file changed, 8 insertions(+), 103 deletions(-) diff --git a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py index ad8e67f6c8..b22e6172af 100644 --- a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +++ b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py @@ -1,75 +1,16 @@ -from datasets import load_dataset from mteb.abstasks.retrieval import AbsTaskRetrieval from mteb.abstasks.task_metadata import TaskMetadata -def _load_data( - path: str, - splits: str, - revision: str | None = None, -): - corpus = {} - queries = {} - relevant_docs = {} - - for split in splits: - query_ds = load_dataset( - path, - "queries", - split=split, - revision=revision, - ) - query_ds = query_ds.map( - lambda x: { - "id": f"query-{split}-{x['query_id']}", - "text": x["query"], - "modality": "text", - }, - remove_columns=["query_id", "query"], - ) - queries[split] = query_ds - - corpus_ds = load_dataset( - path, - "corpus", - split=split, - revision=revision, - ) - corpus_ds = corpus_ds.map( - lambda x: { - "id": f"corpus-{split}-{x['corpus_id']}", - "modality": "image", - }, - remove_columns=["corpus_id"], - ) - corpus[split] = corpus_ds - - qrels_ds = load_dataset( - path, - "qrels", - split=split, - revision=revision, - ) - relevant_docs[split] = {} - for row in qrels_ds: - qid = f"query-{split}-{row['query_id']}" - did = f"corpus-{split}-{row['corpus_id']}" - if qid not in relevant_docs[split]: - relevant_docs[split][qid] = {} - relevant_docs[split][qid][did] = int(row["score"]) - - return corpus, queries, relevant_docs - - class KoVidore2CybersecurityRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="KoVidore2CybersecurityRetrieval", description="Retrieve associated pages according to questions. This dataset, Cybersecurity, is a corpus of technical reports on cyber threat trends and security incident responses in Korea, intended for complex-document understanding tasks.", reference="https://github.com/whybe-choi/kovidore-data-generator", dataset={ - "path": "whybe-choi/kovidore-v2-cybersecurity-beir", - "revision": "006dcb0e8f63c9736687cb36e725769c903054b0", + "path": "whybe-choi/kovidore-v2-cybersecurity-mteb", + "revision": "577d7c45f79d8eb4e7584db3990f91daa7e47956", }, type="DocumentUnderstanding", category="t2i", @@ -96,15 +37,6 @@ class KoVidore2CybersecurityRetrieval(AbsTaskRetrieval): prompt={"query": "Find a screenshot that is relevant to the user's question."}, ) - def load_data(self) -> None: - self.corpus, self.queries, self.relevant_docs = _load_data( - path=self.metadata.dataset["path"], - splits=self.metadata.eval_splits, - revision=self.metadata.dataset["revision"], - ) - - self.data_loaded = True - class KoVidore2EconomicRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( @@ -112,8 +44,8 @@ class KoVidore2EconomicRetrieval(AbsTaskRetrieval): description="Retrieve associated pages according to questions. This dataset, Economic trends, is a corpus of periodic reports on major economic indicators in Korea, intended for complex-document understanding tasks.", reference="https://github.com/whybe-choi/kovidore-data-generator", dataset={ - "path": "whybe-choi/kovidore-v2-economic-beir", - "revision": "8400656ad1e90e7662d7cda44628eaa2d29ea8d8", + "path": "whybe-choi/kovidore-v2-economic-mteb", + "revision": "0189c26211290a902cd9d41a0db932808a54c0a8", }, type="DocumentUnderstanding", category="t2i", @@ -140,15 +72,6 @@ class KoVidore2EconomicRetrieval(AbsTaskRetrieval): prompt={"query": "Find a screenshot that is relevant to the user's question."}, ) - def load_data(self) -> None: - self.corpus, self.queries, self.relevant_docs = _load_data( - path=self.metadata.dataset["path"], - splits=self.metadata.eval_splits, - revision=self.metadata.dataset["revision"], - ) - - self.data_loaded = True - class KoVidore2EnergyRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( @@ -156,8 +79,8 @@ class KoVidore2EnergyRetrieval(AbsTaskRetrieval): description="Retrieve associated pages according to questions. This dataset, Energy, is a corpus of reports on energy market trends, policy planning, and industry statistics, intended for complex-document understanding tasks.", reference="https://github.com/whybe-choi/kovidore-data-generator", dataset={ - "path": "whybe-choi/kovidore-v2-energy-beir", - "revision": "17fea125be86500c0d7891967ca0e4ada14fbe0d", + "path": "whybe-choi/kovidore-v2-energy-mteb", + "revision": "f967fa70b5cf287d6d39ec16520786cb78e971a4", }, type="DocumentUnderstanding", category="t2i", @@ -184,15 +107,6 @@ class KoVidore2EnergyRetrieval(AbsTaskRetrieval): prompt={"query": "Find a screenshot that is relevant to the user's question."}, ) - def load_data(self) -> None: - self.corpus, self.queries, self.relevant_docs = _load_data( - path=self.metadata.dataset["path"], - splits=self.metadata.eval_splits, - revision=self.metadata.dataset["revision"], - ) - - self.data_loaded = True - class KoVidore2HrRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( @@ -200,8 +114,8 @@ class KoVidore2HrRetrieval(AbsTaskRetrieval): description="Retrieve associated pages according to questions. This dataset, HR, is a corpus of reports on workforce outlook and employment policy in korea, intended for complex-document understanding tasks.", reference="https://github.com/whybe-choi/kovidore-data-generator", dataset={ - "path": "whybe-choi/kovidore-v2-hr-beir", - "revision": "0641db2d66968538823af3a847257ee6b813c57e", + "path": "whybe-choi/kovidore-v2-hr-mteb", + "revision": "d9432c782a9a3e2eed064f6fac08b4c967d92b99", }, type="DocumentUnderstanding", category="t2i", @@ -227,12 +141,3 @@ class KoVidore2HrRetrieval(AbsTaskRetrieval): """, prompt={"query": "Find a screenshot that is relevant to the user's question."}, ) - - def load_data(self) -> None: - self.corpus, self.queries, self.relevant_docs = _load_data( - path=self.metadata.dataset["path"], - splits=self.metadata.eval_splits, - revision=self.metadata.dataset["revision"], - ) - - self.data_loaded = True From cf0cad382427a5cb29e74c4dbfb72824f5954ab6 Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Thu, 8 Jan 2026 13:55:32 +0900 Subject: [PATCH 4/5] lint format --- mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py index b22e6172af..1e904a92bf 100644 --- a/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py +++ b/mteb/tasks/retrieval/kor/kovidore2_bench_retrieval.py @@ -1,4 +1,3 @@ - from mteb.abstasks.retrieval import AbsTaskRetrieval from mteb.abstasks.task_metadata import TaskMetadata From fe080f041d42f01c50f79e39f36d63f4f63bf865 Mon Sep 17 00:00:00 2001 From: Yongbin Choi Date: Fri, 9 Jan 2026 13:14:25 +0900 Subject: [PATCH 5/5] delete benchmark language view Co-authored-by: Roman Solomatin --- mteb/benchmarks/benchmarks/benchmarks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py index eb6b8436ea..6b06e44122 100644 --- a/mteb/benchmarks/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks/benchmarks.py @@ -2732,7 +2732,6 @@ KOVIDORE_V2 = Benchmark( name="KoViDoRe(v2)", display_name="KoViDoRe v2", - language_view=["kor-Hang"], tasks=get_tasks( tasks=[ "KoVidore2CybersecurityRetrieval",