From b9fa19230cdc80220d1cf9284bc5e3f5c517e8e1 Mon Sep 17 00:00:00 2001 From: iliass Date: Wed, 23 Jul 2025 17:45:08 +0200 Subject: [PATCH 01/29] chore: add 'Patent retrieval' subtype to TaskMetadata --- mteb/abstasks/TaskMetadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index ac8b52890d..5d801ea8fb 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -21,6 +21,7 @@ TASK_SUBTYPE = Literal[ "Article retrieval", + "Patent retrieval", "Conversational retrieval", "Dialect pairing", "Dialog Systems", From d307718fdc56d7da4c0447c71fec4a06d3646e66 Mon Sep 17 00:00:00 2001 From: iliass Date: Wed, 23 Jul 2025 17:50:47 +0200 Subject: [PATCH 02/29] feat(retrieval): add DAPFAM patent retrieval tasks (+18 variants) --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 220 ++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py new file mode 100644 index 0000000000..430c19a2df --- /dev/null +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -0,0 +1,220 @@ +from __future__ import annotations + +import math + +import numpy as np +from datasets import load_dataset +from sentence_transformers.quantization import quantize_embeddings +from sklearn.metrics import average_precision_score + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from ....abstasks.TaskMetadata import TaskMetadata +from ....load_results.task_results import TaskResult + +HF_REPO = "datalyes/DAPFAM_patent" +REFERENCE = "https://arxiv.org/abs/2506.22141" +BIBTEX = r"""@article{ayaou2025dapfam, + title={DAPFAM: A Domain-Aware Patent Retrieval Dataset Aggregated at the Family Level}, + author={Ayaou, Iliass and Cavallucci, Denis and Chibane, Hicham}, + journal={arXiv preprint arXiv:2506.22141}, + year={2025} +}""" + + 
+DOMAIN_LABELS = {"ALL": None, "IN": "IN", "OUT": "OUT"} + + +QUERY_VARIANTS = { + "TitleAbstract": ["title_en", "abstract_en"], + "TitleAbstractClaims": ["title_en", "abstract_en", "claims_text"], +} +CORPUS_VARIANTS = { + "TitleAbstract": ["title_en", "abstract_en"], + "TitleAbstractClaims": ["title_en", "abstract_en", "claims_text"], + "TitleAbstractClaimsDescription": [ + "title_en", + "abstract_en", + "claims_text", + "description_en", + ], +} + + +def make_load_data(q_fields, c_fields): + def load_data(self, **kwargs): + # 1) Pull HF splits + ds_c = load_dataset(HF_REPO, "corpus", split="train") + ds_q = load_dataset(HF_REPO, "queries", split="train") + ds_r = load_dataset(HF_REPO, "relations", split="train") + # 2) Build dicts + corpus = { + r["relevant_id"]: "\n".join( + str(r[f]) for f in c_fields if r.get(f) is not None + ) + for r in ds_c + } + queries = { + r["query_id"]: "\n".join( + str(r[f]) for f in q_fields if r.get(f) is not None + ) + for r in ds_q + } + + qrels = {} + for r in ds_r: + qid, pid = r["query_id"], r["relevant_id"] + qrels.setdefault(qid, {})[pid] = ( + float(r["relevance_score"]), + r["domain_rel"], + ) + + self.corpus = {"test": corpus} + self.queries = {"test": queries} + self.relevant_docs = {"test": qrels} + self.data_loaded = True + return self.corpus, self.queries, self.relevant_docs + + return load_data + + +def make_evaluate(domain_filter): + def evaluate( + self, model_wrapper, split="test", subsets_to_run=None, **kwargs + ) -> TaskResult: + if not getattr(self, "data_loaded", False): + self.load_data() + corpus = self.corpus[split] + queries = self.queries[split] + qrels_map = self.relevant_docs[split] + + encode_kwargs = kwargs.get("encode_kwargs", {}) + corp_ids, corp_txts = zip(*corpus.items()) + qry_ids, qry_txts = zip(*queries.items()) + + emb_c = model_wrapper.model.encode( + list(corp_txts), **encode_kwargs, show_progress_bar=True + ) + emb_q = model_wrapper.model.encode( + list(qry_txts), **encode_kwargs, 
show_progress_bar=True + ) + + # Quantize the embeddings + emb_c = quantize_embeddings(emb_c, precision="uint8") + emb_q = quantize_embeddings(emb_q, precision="uint8") + + emb_c = emb_c / np.linalg.norm(emb_c, axis=1, keepdims=True) + emb_q = emb_q / np.linalg.norm(emb_q, axis=1, keepdims=True) + + sims = emb_q.dot(emb_c.T) + + run_dict = {} + for i, qid in enumerate(qry_ids): + scores = sims[i] + idxs = np.argsort(-scores) + run_dict[qid] = [(corp_ids[j], float(scores[j])) for j in idxs] + + def ndcg_at_k(preds, refset, k): + if not refset: + return 1.0 + gains = [1.0 if pid in refset else 0.0 for pid in preds[:k]] + + def dcg(g): + return sum((2**v - 1) / math.log2(i + 2) for i, v in enumerate(g)) + + ideal = sorted(gains, reverse=True) + idcg = dcg(ideal) + + if idcg <= 0.0: + return 0.0 + return dcg(gains) / idcg + + rec10 = [] + rec100 = [] + ndc10 = [] + ndc100 = [] + map10 = [] + map100 = [] + for qid, ranking in run_dict.items(): + preds = [pid for pid, _ in ranking] + + full = {pid for pid, (s, _) in qrels_map.get(qid, {}).items() if s > 0} + if domain_filter: + relset = { + pid + for pid, (s, dom) in qrels_map[qid].items() + if s > 0 and dom == domain_filter + } + else: + relset = full + + for k, rec_list in ((10, rec10), (100, rec100)): + hits = len(set(preds[:k]) & relset) + rec = hits / len(relset) if relset else 1.0 + rec_list.append(rec) + + ndc10.append(ndcg_at_k(preds, relset, 10)) + ndc100.append(ndcg_at_k(preds, relset, 100)) + + for k, map_list in ((10, map10), (100, map100)): + topk = preds[:k] + y_true = [1 if pid in relset else 0 for pid in topk] + if sum(y_true) == 0: + ap = 1.0 if not relset else 0.0 + else: + y_scores = [k - i for i in range(k)] + ap = average_precision_score(y_true, y_scores) + map_list.append(ap) + + # 6) aggregate macro-averages + metrics = { + "recall@10": float(np.mean(rec10)), + "recall@100": float(np.mean(rec100)), + "ndcg@10": float(np.mean(ndc10)), + "ndcg@100": float(np.mean(ndc100)), + "map@10": 
float(np.mean(map10)), + "map@100": float(np.mean(map100)), + "main_score": float(np.mean(ndc10)), + } + + return {"default": metrics} + + return evaluate + + +# ——— register all tasks ——— +for domain, domlbl in DOMAIN_LABELS.items(): + for qn, qf in QUERY_VARIANTS.items(): + for cn, cf in CORPUS_VARIANTS.items(): + task_name = f"Dapfam_{domain}_{qn}_{cn}" + metadata = TaskMetadata( + name=task_name, + description=f"DAPFAM [{domain}] Q={qn} / C={cn}", + dataset={"path": HF_REPO, "revision": "main"}, + reference=REFERENCE, + type="Retrieval", + category="p2p", + task_subtypes=["Patent retrieval", "Article retrieval"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg@10", + date=("2025-06-30", "2025-06-30"), + domains=["Engineering", "Chemistry", "Legal"], + license="not specified", + annotations_creators="derived", + sample_creation="created", + judged_docs_only_flag=False, + bibtex_citation=BIBTEX, + ) + + def __init__(self): + super(self.__class__, self).__init__() + self.load_data() + self.calculate_metadata_metrics() + + attrs = { + "__init__": __init__, + "metadata": metadata, + "load_data": make_load_data(qf, cf), + "evaluate": make_evaluate(domlbl), + } + globals()[task_name] = type(task_name, (AbsTaskRetrieval,), attrs) From fbb72363001d6edc541fada4c6929371ad01a51b Mon Sep 17 00:00:00 2001 From: iliass Date: Thu, 24 Jul 2025 00:38:21 +0200 Subject: [PATCH 03/29] Dapfam patent retrieval PR #2946 : refactor DAPFAM tasks (explicit classes, license, metadata, custom definition explanation ...) 
--- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 406 +++++++++++++----- 1 file changed, 301 insertions(+), 105 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 430c19a2df..09a7c312f2 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -1,6 +1,8 @@ +# mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py from __future__ import annotations import math +from typing import Dict, List, Optional, Tuple import numpy as np from datasets import load_dataset @@ -9,26 +11,45 @@ from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval from ....abstasks.TaskMetadata import TaskMetadata -from ....load_results.task_results import TaskResult +# ─────────────────────────────────────────────────── +# GLOBAL CONSTANTS HF_REPO = "datalyes/DAPFAM_patent" REFERENCE = "https://arxiv.org/abs/2506.22141" -BIBTEX = r"""@article{ayaou2025dapfam, - title={DAPFAM: A Domain-Aware Patent Retrieval Dataset Aggregated at the Family Level}, - author={Ayaou, Iliass and Cavallucci, Denis and Chibane, Hicham}, - journal={arXiv preprint arXiv:2506.22141}, - year={2025} +BIBTEX = r"""@misc{ayaou2025dapfam, + title = {DAPFAM: A Domain-Aware Patent Retrieval Dataset Aggregated at the Family Level}, + author = {Ayaou, Iliass and Cavallucci, Denis and Chibane, Hicham}, + year = {2025}, + eprint = {2506.22141}, + archivePrefix= {arXiv}, + primaryClass = {cs.CL} }""" - -DOMAIN_LABELS = {"ALL": None, "IN": "IN", "OUT": "OUT"} - - -QUERY_VARIANTS = { +# shared metadata +_DEFAULT_META = dict( + dataset={"path": HF_REPO, "revision": "main"}, + reference=REFERENCE, + type="Retrieval", + category="p2p", + task_subtypes=["Article retrieval", "Patent retrieval"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg@10", + date=("1964-06-26", "2023-06-20"), # dataset card coverage + domains=["Engineering", "Chemistry", "Legal"], + 
license="cc-by-nc-sa-4.0", + annotations_creators="derived", + sample_creation="created", + # judged_docs_only_flag = False, + bibtex_citation=BIBTEX, +) + +# text-field dictionaries +_QUERY_FIELDS = { "TitleAbstract": ["title_en", "abstract_en"], "TitleAbstractClaims": ["title_en", "abstract_en", "claims_text"], } -CORPUS_VARIANTS = { +_CORPUS_FIELDS = { "TitleAbstract": ["title_en", "abstract_en"], "TitleAbstractClaims": ["title_en", "abstract_en", "claims_text"], "TitleAbstractClaimsDescription": [ @@ -39,81 +60,106 @@ ], } +# paper variants used in Table 4 +_IN_PAPER = { + ("TitleAbstract", "TitleAbstractClaims"), + ("TitleAbstractClaims", "TitleAbstractClaims"), +} + + +# ─────────────────────────────────────────────────── +# MIX-IN with shared logic + metric implementation +class _DAPFAMMixin: + # class-level attributes are filled in each concrete subclass + domain_filter: Optional[str] = None + query_fields: List[str] = [] + corpus_fields: List[str] = [] + in_paper: bool = False -def make_load_data(q_fields, c_fields): - def load_data(self, **kwargs): - # 1) Pull HF splits + # ------------ data loading (identical for all variants) ------------ + def load_data(self, **_) -> Tuple[Dict, Dict, Dict]: ds_c = load_dataset(HF_REPO, "corpus", split="train") ds_q = load_dataset(HF_REPO, "queries", split="train") ds_r = load_dataset(HF_REPO, "relations", split="train") - # 2) Build dicts - corpus = { - r["relevant_id"]: "\n".join( - str(r[f]) for f in c_fields if r.get(f) is not None - ) - for r in ds_c + + self.corpus = { + "test": { + r["relevant_id"]: "\n".join( + str(r[f]) for f in self.corpus_fields if r.get(f) + ) + for r in ds_c + } } - queries = { - r["query_id"]: "\n".join( - str(r[f]) for f in q_fields if r.get(f) is not None - ) - for r in ds_q + self.queries = { + "test": { + r["query_id"]: "\n".join( + str(r[f]) for f in self.query_fields if r.get(f) + ) + for r in ds_q + } } - qrels = {} + qrels: Dict[str, Dict[str, Tuple[float, str]]] = {} for r in 
ds_r: qid, pid = r["query_id"], r["relevant_id"] qrels.setdefault(qid, {})[pid] = ( float(r["relevance_score"]), r["domain_rel"], ) - - self.corpus = {"test": corpus} - self.queries = {"test": queries} + # 4) Assign for MTEB self.relevant_docs = {"test": qrels} self.data_loaded = True return self.corpus, self.queries, self.relevant_docs - return load_data - - -def make_evaluate(domain_filter): - def evaluate( - self, model_wrapper, split="test", subsets_to_run=None, **kwargs - ) -> TaskResult: + # ------------ evaluation (faithful to the paper) ------------ + def _dapfam_evaluate( + self, + model_wrapper, + split: str = "test", + subsets_to_run=None, + **kwargs, + ) -> Dict[str, Dict[str, float]]: + """Custom evaluation that quantises embeddings to uint8 before + normalisation (per the paper) and + computes recall / nDCG / mAP exactly like the paper. + It is fully deterministic. + """ if not getattr(self, "data_loaded", False): self.load_data() + corpus = self.corpus[split] queries = self.queries[split] - qrels_map = self.relevant_docs[split] + qrels = self.relevant_docs[split] - encode_kwargs = kwargs.get("encode_kwargs", {}) - corp_ids, corp_txts = zip(*corpus.items()) - qry_ids, qry_txts = zip(*queries.items()) + corp_ids, corp_texts = zip(*corpus.items()) + qry_ids, qry_texts = zip(*queries.items()) + encode_kwargs = kwargs.get("encode_kwargs", {}) emb_c = model_wrapper.model.encode( - list(corp_txts), **encode_kwargs, show_progress_bar=True + list(corp_texts), **encode_kwargs, show_progress_bar=True ) emb_q = model_wrapper.model.encode( - list(qry_txts), **encode_kwargs, show_progress_bar=True + list(qry_texts), **encode_kwargs, show_progress_bar=True ) - # Quantize the embeddings + # uint8 quantisation (per paper) emb_c = quantize_embeddings(emb_c, precision="uint8") emb_q = quantize_embeddings(emb_q, precision="uint8") + # cosine similarity emb_c = emb_c / np.linalg.norm(emb_c, axis=1, keepdims=True) emb_q = emb_q / np.linalg.norm(emb_q, axis=1, 
keepdims=True) + sims = emb_q @ emb_c.T - sims = emb_q.dot(emb_c.T) - - run_dict = {} + # ranking per query Dict[str, List[str]] + run: Dict[str, List[str]] = {} for i, qid in enumerate(qry_ids): scores = sims[i] idxs = np.argsort(-scores) - run_dict[qid] = [(corp_ids[j], float(scores[j])) for j in idxs] + run[qid] = [(corp_ids[j], float(scores[j])) for j in idxs] - def ndcg_at_k(preds, refset, k): + # ---- metric helpers ---- + def ndcg_at_k(preds: List[str], refset: set[str], k: int) -> float: if not refset: return 1.0 gains = [1.0 if pid in refset else 0.0 for pid in preds[:k]] @@ -123,7 +169,7 @@ def dcg(g): ideal = sorted(gains, reverse=True) idcg = dcg(ideal) - + # if ideal DCG is zero, return zero per paper if idcg <= 0.0: return 0.0 return dcg(gains) / idcg @@ -134,87 +180,237 @@ def dcg(g): ndc100 = [] map10 = [] map100 = [] - for qid, ranking in run_dict.items(): - preds = [pid for pid, _ in ranking] - full = {pid for pid, (s, _) in qrels_map.get(qid, {}).items() if s > 0} - if domain_filter: + for qid, ranking in run.items(): + preds = [pid for pid, _ in ranking] + full = {d for d, (s, _) in qrels.get(qid, {}).items() if s > 0} + if self.domain_filter: relset = { pid - for pid, (s, dom) in qrels_map[qid].items() - if s > 0 and dom == domain_filter + for pid, (s, dom) in qrels[qid].items() + if s > 0 and dom == self.domain_filter } else: relset = full + # recall@K for k, rec_list in ((10, rec10), (100, rec100)): hits = len(set(preds[:k]) & relset) rec = hits / len(relset) if relset else 1.0 rec_list.append(rec) + # nDCG@K ndc10.append(ndcg_at_k(preds, relset, 10)) ndc100.append(ndcg_at_k(preds, relset, 100)) + # mAP@K via rank-based scores over *top-K* only, per paper for k, map_list in ((10, map10), (100, map100)): + # build binary truth for the top-k topk = preds[:k] y_true = [1 if pid in relset else 0 for pid in topk] + # if no positives exist, perfect; else zero if none in top-k if sum(y_true) == 0: ap = 1.0 if not relset else 0.0 else: + # rank‐based 
scores k, k−1, …, 1 y_scores = [k - i for i in range(k)] ap = average_precision_score(y_true, y_scores) map_list.append(ap) - # 6) aggregate macro-averages - metrics = { - "recall@10": float(np.mean(rec10)), - "recall@100": float(np.mean(rec100)), - "ndcg@10": float(np.mean(ndc10)), - "ndcg@100": float(np.mean(ndc100)), - "map@10": float(np.mean(map10)), - "map@100": float(np.mean(map100)), - "main_score": float(np.mean(ndc10)), + return { + "default": { + "recall@10": float(np.mean(rec10)), + "recall@100": float(np.mean(rec100)), + "ndcg@10": float(np.mean(ndc10)), + "ndcg@100": float(np.mean(ndc100)), + "map@10": float(np.mean(map10)), + "map@100": float(np.mean(map100)), + "main_score": float(np.mean(ndc10)), + } } - return {"default": metrics} - - return evaluate - - -# ——— register all tasks ——— -for domain, domlbl in DOMAIN_LABELS.items(): - for qn, qf in QUERY_VARIANTS.items(): - for cn, cf in CORPUS_VARIANTS.items(): - task_name = f"Dapfam_{domain}_{qn}_{cn}" - metadata = TaskMetadata( - name=task_name, - description=f"DAPFAM [{domain}] Q={qn} / C={cn}", - dataset={"path": HF_REPO, "revision": "main"}, - reference=REFERENCE, - type="Retrieval", - category="p2p", - task_subtypes=["Patent retrieval", "Article retrieval"], - eval_splits=["test"], - eval_langs=["eng-Latn"], - main_score="ndcg@10", - date=("2025-06-30", "2025-06-30"), - domains=["Engineering", "Chemistry", "Legal"], - license="not specified", - annotations_creators="derived", - sample_creation="created", - judged_docs_only_flag=False, - bibtex_citation=BIBTEX, - ) + def evaluate( + self, + model_wrapper, + split: str = "test", + subsets_to_run=None, + **kwargs, + ) -> Dict[str, Dict[str, float]]: + return self._dapfam_evaluate(model_wrapper, split, subsets_to_run, **kwargs) + + +# ─────────────────────────────────────────────────── +# helper to build TaskMetadata +def _meta(name: str, desc: str) -> TaskMetadata: + return TaskMetadata(name=name, description=desc, **_DEFAULT_META) - def 
__init__(self): - super(self.__class__, self).__init__() - self.load_data() - self.calculate_metadata_metrics() - attrs = { - "__init__": __init__, - "metadata": metadata, - "load_data": make_load_data(qf, cf), - "evaluate": make_evaluate(domlbl), - } - globals()[task_name] = type(task_name, (AbsTaskRetrieval,), attrs) +# ─────────────────────────────────────────────────── +# 18 explicit task classes (no loops) + +# NOTE: Each class only sets class-level attributes + +# AbsTaskRetrieval.__init__ will call self.load_data() and compute metadata automatically. + + +# ---------- ALL domain ---------- +class Dapfam_ALL_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = None + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + in_paper = False + metadata = _meta(__qualname__, "ALL • Query: TA | Corpus: TA") + + +class Dapfam_ALL_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = None + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + in_paper = True + metadata = _meta(__qualname__, "ALL • Query: TA | Corpus: TA+Claims (paper)") + + +class Dapfam_ALL_TitleAbstract_TitleAbstractClaimsDescription( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = None + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False + metadata = _meta(__qualname__, "ALL • Query: TA | Corpus: TA+Claims+Desc") + + +class Dapfam_ALL_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = None + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + in_paper = False + metadata = _meta(__qualname__, "ALL • Query: TA+Claims | Corpus: TA") + + +class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaims( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = None + query_fields = 
_QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + in_paper = True + metadata = _meta(__qualname__, "ALL • Query: TA+Claims | Corpus: TA+Claims (paper)") + + +class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaimsDescription( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = None + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False + metadata = _meta(__qualname__, "ALL • Query: TA+Claims | Corpus: TA+Claims+Desc") + + +# ---------- IN domain ---------- +class Dapfam_IN_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + in_paper = False + metadata = _meta(__qualname__, "IN • Query: TA | Corpus: TA") + + +class Dapfam_IN_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + in_paper = True + metadata = _meta(__qualname__, "IN • Query: TA | Corpus: TA+Claims (paper)") + + +class Dapfam_IN_TitleAbstract_TitleAbstractClaimsDescription( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False + metadata = _meta(__qualname__, "IN • Query: TA | Corpus: TA+Claims+Desc") + + +class Dapfam_IN_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + in_paper = False + metadata = _meta(__qualname__, "IN • Query: TA+Claims | Corpus: TA") + + +class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "IN" + query_fields = 
_QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + in_paper = True + metadata = _meta(__qualname__, "IN • Query: TA+Claims | Corpus: TA+Claims (paper)") + + +class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaimsDescription( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False + metadata = _meta(__qualname__, "IN • Query: TA+Claims | Corpus: TA+Claims+Desc") + + +# ---------- OUT domain ---------- +class Dapfam_OUT_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + in_paper = False + metadata = _meta(__qualname__, "OUT • Query: TA | Corpus: TA") + + +class Dapfam_OUT_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + in_paper = True + metadata = _meta(__qualname__, "OUT • Query: TA | Corpus: TA+Claims (paper)") + + +class Dapfam_OUT_TitleAbstract_TitleAbstractClaimsDescription( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False + metadata = _meta(__qualname__, "OUT • Query: TA | Corpus: TA+Claims+Desc") + + +class Dapfam_OUT_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + in_paper = False + metadata = _meta(__qualname__, "OUT • Query: TA+Claims | Corpus: TA") + + +class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaims( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = "OUT" + query_fields = 
_QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + in_paper = True + metadata = _meta(__qualname__, "OUT • Query: TA+Claims | Corpus: TA+Claims (paper)") + + +class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaimsDescription( + _DAPFAMMixin, AbsTaskRetrieval +): + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False + metadata = _meta(__qualname__, "OUT • Query: TA+Claims | Corpus: TA+Claims+Desc") From dc1271adb237a2f9a367aab66e6b945dffda4736 Mon Sep 17 00:00:00 2001 From: iliass Date: Thu, 24 Jul 2025 00:41:46 +0200 Subject: [PATCH 04/29] Dapfam patent retrieval PR #2946 : refactor DAPFAM tasks (explicit classes, license, metadata, custom definition explanation ...) --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 09a7c312f2..b35dc00cb3 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -40,7 +40,6 @@ license="cc-by-nc-sa-4.0", annotations_creators="derived", sample_creation="created", - # judged_docs_only_flag = False, bibtex_citation=BIBTEX, ) From 39f70990467be8b6da87dddb8a41be78274a2b1e Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 05:08:17 +0200 Subject: [PATCH 05/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index b35dc00cb3..a0019c02fb 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -12,8 +12,6 @@ from 
....abstasks.AbsTaskRetrieval import AbsTaskRetrieval from ....abstasks.TaskMetadata import TaskMetadata -# ─────────────────────────────────────────────────── -# GLOBAL CONSTANTS HF_REPO = "datalyes/DAPFAM_patent" REFERENCE = "https://arxiv.org/abs/2506.22141" BIBTEX = r"""@misc{ayaou2025dapfam, From 2935c9f04c535e68a5911e852010421aee5f4410 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 05:08:25 +0200 Subject: [PATCH 06/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index a0019c02fb..166dc12a3d 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -1,4 +1,3 @@ -# mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py from __future__ import annotations import math From 1dd4ee506cf76f92c61017edd0855a92d7a5cd84 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 05:08:36 +0200 Subject: [PATCH 07/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 166dc12a3d..51de3d4005 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -22,8 +22,7 @@ primaryClass = {cs.CL} }""" -# shared metadata -_DEFAULT_META = dict( +_SHARED_METADATA = dict( dataset={"path": HF_REPO, "revision": "main"}, reference=REFERENCE, type="Retrieval", From 84a92b1cbd6d0d7bf6866f4270ab2ec057c39c4b Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 05:08:48 +0200 Subject: [PATCH 08/29] Update 
mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 51de3d4005..1767e2626c 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -71,7 +71,6 @@ class _DAPFAMMixin: corpus_fields: List[str] = [] in_paper: bool = False - # ------------ data loading (identical for all variants) ------------ def load_data(self, **_) -> Tuple[Dict, Dict, Dict]: ds_c = load_dataset(HF_REPO, "corpus", split="train") ds_q = load_dataset(HF_REPO, "queries", split="train") From efd5e2a57492dfd619f0d7af9e137f5d8b8cb0e0 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 05:08:58 +0200 Subject: [PATCH 09/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 1767e2626c..83a68e1cd7 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -105,7 +105,6 @@ def load_data(self, **_) -> Tuple[Dict, Dict, Dict]: self.data_loaded = True return self.corpus, self.queries, self.relevant_docs - # ------------ evaluation (faithful to the paper) ------------ def _dapfam_evaluate( self, model_wrapper, From fd5b02c9b4636c0af265265c09c0289876f68644 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 05:09:14 +0200 Subject: [PATCH 10/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 9 --------- 1 file changed, 9 deletions(-) diff --git 
a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 83a68e1cd7..6308a66f6a 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -238,15 +238,6 @@ def _meta(name: str, desc: str) -> TaskMetadata: return TaskMetadata(name=name, description=desc, **_DEFAULT_META) -# ─────────────────────────────────────────────────── -# 18 explicit task classes (no loops) - -# NOTE: Each class only sets class-level attributes - -# AbsTaskRetrieval.__init__ will call self.load_data() and compute metadata automatically. - - -# ---------- ALL domain ---------- class Dapfam_ALL_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstract"] From 77f7f9f1c1a3daefbcf8b5a9f80eb08e5758a1c9 Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 25 Jul 2025 06:40:10 +0200 Subject: [PATCH 11/29] Changes : - Added possibility to opt in or out of quantization through the "quantize" argument. - Added possibility to compute raw dot product without normalization. (to reproduce the paper method the "similarity" argument should be "cosine"). - Removed unecessary function and overhauled the tasks descriptions to be more clear. --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 210 +++++++++++++++--- 1 file changed, 181 insertions(+), 29 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 6308a66f6a..4052468ceb 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -114,7 +114,7 @@ def _dapfam_evaluate( ) -> Dict[str, Dict[str, float]]: """Custom evaluation that quantises embeddings to uint8 before normalisation (per the paper) and - computes recall / nDCG / mAP exactly like the paper. + computes recall / nDCG / mAP exactly like the paper if quantize=True and similarity=cosine. 
It is fully deterministic. """ if not getattr(self, "data_loaded", False): @@ -128,6 +128,9 @@ def _dapfam_evaluate( qry_ids, qry_texts = zip(*queries.items()) encode_kwargs = kwargs.get("encode_kwargs", {}) + quantize = kwargs.get("quantize", True) + similarity = kwargs.get("similarity", "cosine") + emb_c = model_wrapper.model.encode( list(corp_texts), **encode_kwargs, show_progress_bar=True ) @@ -136,12 +139,15 @@ def _dapfam_evaluate( ) # uint8 quantisation (per paper) - emb_c = quantize_embeddings(emb_c, precision="uint8") - emb_q = quantize_embeddings(emb_q, precision="uint8") + if quantize: + emb_c = quantize_embeddings(emb_c, precision="uint8") + emb_q = quantize_embeddings(emb_q, precision="uint8") + + # cosine similarity (to reproduce paper) + if similarity == "cosine": + emb_c = emb_c / np.linalg.norm(emb_c, axis=1, keepdims=True) + emb_q = emb_q / np.linalg.norm(emb_q, axis=1, keepdims=True) - # cosine similarity - emb_c = emb_c / np.linalg.norm(emb_c, axis=1, keepdims=True) - emb_q = emb_q / np.linalg.norm(emb_q, axis=1, keepdims=True) sims = emb_q @ emb_c.T # ranking per query Dict[str, List[str]] @@ -233,9 +239,7 @@ def evaluate( # ─────────────────────────────────────────────────── -# helper to build TaskMetadata -def _meta(name: str, desc: str) -> TaskMetadata: - return TaskMetadata(name=name, description=desc, **_DEFAULT_META) +# ALL domains (no IPC filtering) Tasks class Dapfam_ALL_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): @@ -243,7 +247,15 @@ class Dapfam_ALL_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False - metadata = _meta(__qualname__, "ALL • Query: TA | Corpus: TA") + metadata = TaskMetadata( + name=__qualname__, + description=( + "All-domain retrieval: no domain filtering. " + "Queries use title + abstract; corpus uses title + abstract. 
" + "Goal: retrieve citation-linked patent families across all IPC codes " + ), + **_SHARED_METADATA, + ) class Dapfam_ALL_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): @@ -251,7 +263,15 @@ class Dapfam_ALL_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieva query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True - metadata = _meta(__qualname__, "ALL • Query: TA | Corpus: TA+Claims (paper)") + metadata = TaskMetadata( + name=__qualname__, + description=( + "All-domain retrieval (paper variant): no domain filtering. " + "Queries use title + abstract; corpus adds claims. " + "Goal: leverage claims text to retrieve citation-linked patent families across all IPC codes " + ), + **_SHARED_METADATA, + ) class Dapfam_ALL_TitleAbstract_TitleAbstractClaimsDescription( @@ -261,7 +281,15 @@ class Dapfam_ALL_TitleAbstract_TitleAbstractClaimsDescription( query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False - metadata = _meta(__qualname__, "ALL • Query: TA | Corpus: TA+Claims+Desc") + metadata = TaskMetadata( + name=__qualname__, + description=( + "All-domain retrieval: no domain filtering. " + "Queries use title + abstract; corpus uses title, abstract, claims, and description. " + "Goal: evaluate full-text retrieval across all IPC codes. " + ), + **_SHARED_METADATA, + ) class Dapfam_ALL_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): @@ -269,7 +297,15 @@ class Dapfam_ALL_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieva query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False - metadata = _meta(__qualname__, "ALL • Query: TA+Claims | Corpus: TA") + metadata = TaskMetadata( + name=__qualname__, + description=( + "All-domain retrieval: no domain filtering. 
" + "Queries use title, abstract, and claims; corpus uses title + abstract. " + "Goal: assess claim-augmented queries against surface-text patent family corpus. " + ), + **_SHARED_METADATA, + ) class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaims( @@ -279,7 +315,15 @@ class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaims( query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True - metadata = _meta(__qualname__, "ALL • Query: TA+Claims | Corpus: TA+Claims (paper)") + metadata = TaskMetadata( + name=__qualname__, + description=( + "All-domain retrieval : no domain filtering. " + "Both queries and corpus use title, abstract, and claims. " + "Goal: reproduce the paper’s full-claims setup across all IPC codes. " + ), + **_SHARED_METADATA, + ) class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaimsDescription( @@ -289,16 +333,34 @@ class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaimsDescription( query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False - metadata = _meta(__qualname__, "ALL • Query: TA+Claims | Corpus: TA+Claims+Desc") + metadata = TaskMetadata( + name=__qualname__, + description=( + "All-domain retrieval: no domain filtering. " + "Queries use title, abstract, and claims; corpus adds description. " + "Goal: evaluate complete-text patent family retrieval across all IPC codes. " + ), + **_SHARED_METADATA, + ) + + +# IN-domain (≥1 shared IPC top-three code) Tasks -# ---------- IN domain ---------- class Dapfam_IN_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False - metadata = _meta(__qualname__, "IN • Query: TA | Corpus: TA") + metadata = TaskMetadata( + name=__qualname__, + description=( + "In-domain retrieval: query and target share at least one IPC top-three code." 
+ "Queries use title + abstract; corpus uses title + abstract. " + "Goal: retrieve citation-linked patents within the same domain " + ), + **_SHARED_METADATA, + ) class Dapfam_IN_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): @@ -306,7 +368,15 @@ class Dapfam_IN_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True - metadata = _meta(__qualname__, "IN • Query: TA | Corpus: TA+Claims (paper)") + metadata = TaskMetadata( + name=__qualname__, + description=( + "In-domain retrieval: query and target share at least one IPC top-three code." + "Queries use title + abstract; corpus adds claims." + "Goal: leverage claims for in-domain patent retrieval." + ), + **_SHARED_METADATA, + ) class Dapfam_IN_TitleAbstract_TitleAbstractClaimsDescription( @@ -316,7 +386,15 @@ class Dapfam_IN_TitleAbstract_TitleAbstractClaimsDescription( query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False - metadata = _meta(__qualname__, "IN • Query: TA | Corpus: TA+Claims+Desc") + metadata = TaskMetadata( + name=__qualname__, + description=( + "In-domain retrieval: query and target share at least one IPC top-three code." + "Queries use title + abstract; corpus uses title, abstract, claims, and description. " + "Goal: evaluate extended-text in-domain retrieval." 
+ ), + **_SHARED_METADATA, + ) class Dapfam_IN_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): @@ -324,7 +402,15 @@ class Dapfam_IN_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False - metadata = _meta(__qualname__, "IN • Query: TA+Claims | Corpus: TA") + metadata = TaskMetadata( + name=__qualname__, + description=( + "In-domain retrieval: query and target share at least one IPC top-three code." + "Queries use title, abstract, and claims; corpus uses title + abstract. " + "Goal: assess claim-driven in-domain queries." + ), + **_SHARED_METADATA, + ) class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): @@ -332,7 +418,15 @@ class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRet query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True - metadata = _meta(__qualname__, "IN • Query: TA+Claims | Corpus: TA+Claims (paper)") + metadata = TaskMetadata( + name=__qualname__, + description=( + "In-domain retrieval: query and target share at least one IPC top-three code." + "Both queries and corpus use title, abstract, and claims. " + "Goal: reproduce the in-domain full-claims setup." + ), + **_SHARED_METADATA, + ) class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaimsDescription( @@ -342,16 +436,34 @@ class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaimsDescription( query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False - metadata = _meta(__qualname__, "IN • Query: TA+Claims | Corpus: TA+Claims+Desc") + metadata = TaskMetadata( + name=__qualname__, + description=( + "In-domain retrieval: query and target share at least one IPC top-three code." + "Queries use title, abstract, and claims; corpus adds description. 
" + "Goal: evaluate complete-text in-domain retrieval." + ), + **_SHARED_METADATA, + ) + + +# OUT-of-domain (no IPC top-three overlap) Tasks -# ---------- OUT domain ---------- class Dapfam_OUT_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False - metadata = _meta(__qualname__, "OUT • Query: TA | Corpus: TA") + metadata = TaskMetadata( + name=__qualname__, + description=( + "Out-of-domain retrieval: query and target share no IPC top-three codes. " + "Queries use title + abstract; corpus uses title + abstract. " + "Goal: retrieve citation-linked patents across different IPC domains." + ), + **_SHARED_METADATA, + ) class Dapfam_OUT_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): @@ -359,7 +471,15 @@ class Dapfam_OUT_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieva query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True - metadata = _meta(__qualname__, "OUT • Query: TA | Corpus: TA+Claims (paper)") + metadata = TaskMetadata( + name=__qualname__, + description=( + "Out-of-domain retrieval: query and target share no IPC top-three codes. " + "Queries use title + abstract; corpus adds claims. " + "Goal: leverage claims for cross-domain patent retrieval." + ), + **_SHARED_METADATA, + ) class Dapfam_OUT_TitleAbstract_TitleAbstractClaimsDescription( @@ -369,7 +489,15 @@ class Dapfam_OUT_TitleAbstract_TitleAbstractClaimsDescription( query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False - metadata = _meta(__qualname__, "OUT • Query: TA | Corpus: TA+Claims+Desc") + metadata = TaskMetadata( + name=__qualname__, + description=( + "Out-of-domain retrieval: query and target share no IPC top-three codes. 
" + "Queries use title + abstract; corpus uses title, abstract, claims, and description. " + "Goal: evaluate extended-text cross-domain retrieval." + ), + **_SHARED_METADATA, + ) class Dapfam_OUT_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): @@ -377,7 +505,15 @@ class Dapfam_OUT_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieva query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False - metadata = _meta(__qualname__, "OUT • Query: TA+Claims | Corpus: TA") + metadata = TaskMetadata( + name=__qualname__, + description=( + "Out-of-domain retrieval: query and target share no IPC top-three codes. " + "Queries use title, abstract, and claims; corpus uses title + abstract. " + "Goal: assess claim-driven cross-domain queries." + ), + **_SHARED_METADATA, + ) class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaims( @@ -387,7 +523,15 @@ class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaims( query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True - metadata = _meta(__qualname__, "OUT • Query: TA+Claims | Corpus: TA+Claims (paper)") + metadata = TaskMetadata( + name=__qualname__, + description=( + "Out-of-domain retrieval: query and target share no IPC top-three codes. " + "Both queries and corpus use title, abstract, and claims. " + "Goal: reproduce the out-of-domain full-claims setup." 
+ ), + **_SHARED_METADATA, + ) class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaimsDescription( @@ -397,4 +541,12 @@ class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaimsDescription( query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False - metadata = _meta(__qualname__, "OUT • Query: TA+Claims | Corpus: TA+Claims+Desc") + metadata = TaskMetadata( + name=__qualname__, + description=( + "Out-of-domain retrieval: query and target share no IPC top-three codes. " + "Queries use title, abstract, and claims; corpus adds description. " + "Goal: evaluate complete-text cross-domain retrieval." + ), + **_SHARED_METADATA, + ) From ca526af2b68443bee8a6b3f5731781b4a3bcd9e2 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 25 Jul 2025 10:14:27 +0200 Subject: [PATCH 12/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 4052468ceb..480fa9a871 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -62,7 +62,6 @@ } -# ─────────────────────────────────────────────────── # MIX-IN with shared logic + metric implementation class _DAPFAMMixin: # class-level attributes are filled in each concrete subclass From 8467fb6cc9139870ac6c2b01edd15483ec1b6498 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 10:45:52 +0200 Subject: [PATCH 13/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 480fa9a871..b99229f7ec 100644 --- 
a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -536,10 +536,10 @@ class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaims( class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaimsDescription( _DAPFAMMixin, AbsTaskRetrieval ): - domain_filter = "OUT" + # In the paper + domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] - in_paper = False metadata = TaskMetadata( name=__qualname__, description=( From 3155cc66d39f0f2d90319d3fa41a04abaef0998c Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 10:46:07 +0200 Subject: [PATCH 14/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index b99229f7ec..a7b49779c4 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -31,7 +31,7 @@ eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg@10", - date=("1964-06-26", "2023-06-20"), # dataset card coverage + date=("1964-06-26", "2023-06-20"), domains=["Engineering", "Chemistry", "Legal"], license="cc-by-nc-sa-4.0", annotations_creators="derived", From b597b445e9dd4a2453112460ae79175b2950ee64 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Fri, 25 Jul 2025 10:46:50 +0200 Subject: [PATCH 15/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Kenneth Enevoldsen --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index a7b49779c4..9274ebf3f7 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ 
b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -1,7 +1,6 @@ from __future__ import annotations import math -from typing import Dict, List, Optional, Tuple import numpy as np from datasets import load_dataset From c6f771da9fbde46b53ccc44d61f8b7ae7187755e Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 25 Jul 2025 17:43:57 +0200 Subject: [PATCH 16/29] Changes made : - Overhauled task descriptions as well as naming to conform with the naming scheme of mteb retrieval tasks. - Similarity is now computed using the similarity function of the passed model. - Changed optional quantization method to conform with the sentence-transformers similarity function. To reproduce the paper metrics, one can use the following snippet: ```python import mteb from sentence_transformers import SentenceTransformer model_name = "Snowflake/snowflake-arctic-embed-m-v2.0" model = SentenceTransformer(model_name, model_kwargs={ "torch_dtype": "float16", }, trust_remote_code=True, ).cuda().eval() tasks = mteb.get_tasks(tasks=[ "DAPFAMInTitlAbsToTitlAbsClmRetrieval", "DAPFAMAllTitlAbsToTitlAbsClmRetrieval", "DAPFAMOutTitlAbsToTitlAbsClmRetrieval", # add the other 3 remaining tasks ...
]) evaluation = mteb.MTEB(tasks=tasks) results = evaluation.run( model, output_folder=f"mteb_res/{model_name}", quantize=True, # if set to false or not set, the obtained ndcg@10 and map@10 will be ~0.001 higher encode_kwargs= {"batch_size" : 32} ) ``` --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 319 ++++++++++-------- 1 file changed, 184 insertions(+), 135 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 9274ebf3f7..b89168e095 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -64,12 +64,12 @@ # MIX-IN with shared logic + metric implementation class _DAPFAMMixin: # class-level attributes are filled in each concrete subclass - domain_filter: Optional[str] = None - query_fields: List[str] = [] - corpus_fields: List[str] = [] + domain_filter: str | None = None + query_fields: list[str] = [] + corpus_fields: list[str] = [] in_paper: bool = False - def load_data(self, **_) -> Tuple[Dict, Dict, Dict]: + def load_data(self, **_) -> tuple[dict, dict, dict]: ds_c = load_dataset(HF_REPO, "corpus", split="train") ds_q = load_dataset(HF_REPO, "queries", split="train") ds_r = load_dataset(HF_REPO, "relations", split="train") @@ -91,7 +91,7 @@ def load_data(self, **_) -> Tuple[Dict, Dict, Dict]: } } - qrels: Dict[str, Dict[str, Tuple[float, str]]] = {} + qrels: dict[str, dict[str, tuple[float, str]]] = {} for r in ds_r: qid, pid = r["query_id"], r["relevant_id"] qrels.setdefault(qid, {})[pid] = ( @@ -109,7 +109,7 @@ def _dapfam_evaluate( split: str = "test", subsets_to_run=None, **kwargs, - ) -> Dict[str, Dict[str, float]]: + ) -> dict[str, dict[str, float]]: """Custom evaluation that quantises embeddings to uint8 before normalisation (per the paper) and computes recall / nDCG / mAP exactly like the paper if quantize=True and similarity=cosine. 
@@ -127,8 +127,9 @@ def _dapfam_evaluate( encode_kwargs = kwargs.get("encode_kwargs", {}) quantize = kwargs.get("quantize", True) - similarity = kwargs.get("similarity", "cosine") + # check similarity function name : + print(model_wrapper.model.similarity_fn_name) emb_c = model_wrapper.model.encode( list(corp_texts), **encode_kwargs, show_progress_bar=True ) @@ -136,27 +137,25 @@ def _dapfam_evaluate( list(qry_texts), **encode_kwargs, show_progress_bar=True ) - # uint8 quantisation (per paper) + # uint8 quantisation (per paper) if chosen then we go back to fp32 to avoid error + # by sentence transformers similarity function (doesn't accept quantized embeddings) if quantize: - emb_c = quantize_embeddings(emb_c, precision="uint8") - emb_q = quantize_embeddings(emb_q, precision="uint8") + emb_c_q = quantize_embeddings(emb_c, precision="uint8") + emb_q_q = quantize_embeddings(emb_q, precision="uint8") + emb_c = emb_c_q.astype(np.float32) + emb_q = emb_q_q.astype(np.float32) - # cosine similarity (to reproduce paper) - if similarity == "cosine": - emb_c = emb_c / np.linalg.norm(emb_c, axis=1, keepdims=True) - emb_q = emb_q / np.linalg.norm(emb_q, axis=1, keepdims=True) + sims = model_wrapper.model.similarity(emb_q, emb_c).cpu().numpy() - sims = emb_q @ emb_c.T - - # ranking per query Dict[str, List[str]] - run: Dict[str, List[str]] = {} + # ranking per query dict[str, list[str]] + run: dict[str, list[str]] = {} for i, qid in enumerate(qry_ids): scores = sims[i] idxs = np.argsort(-scores) run[qid] = [(corp_ids[j], float(scores[j])) for j in idxs] # ---- metric helpers ---- - def ndcg_at_k(preds: List[str], refset: set[str], k: int) -> float: + def ndcg_at_k(preds: list[str], refset: set[str], k: int) -> float: if not refset: return 1.0 gains = [1.0 if pid in refset else 0.0 for pid in preds[:k]] @@ -232,319 +231,369 @@ def evaluate( split: str = "test", subsets_to_run=None, **kwargs, - ) -> Dict[str, Dict[str, float]]: + ) -> dict[str, dict[str, float]]: return 
self._dapfam_evaluate(model_wrapper, split, subsets_to_run, **kwargs) # ─────────────────────────────────────────────────── -# ALL domains (no IPC filtering) Tasks +# DAPFAM Patent Family Retrieval Tasks -class Dapfam_ALL_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMAllTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMAllTitlAbsToTitlAbsRetrieval", description=( - "All-domain retrieval: no domain filtering. " - "Queries use title + abstract; corpus uses title + abstract. " - "Goal: retrieve citation-linked patent families across all IPC codes " + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title and Abstract. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, no International Patent Classification-based filtering is applied. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to retrieve citation-linked patent families using query and target patent family representations of Title and Abstract across all technical domains." 
), **_SHARED_METADATA, ) -class Dapfam_ALL_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMAllTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMAllTitlAbsToTitlAbsClmRetrieval", description=( - "All-domain retrieval (paper variant): no domain filtering. " - "Queries use title + abstract; corpus adds claims. " - "Goal: leverage claims text to retrieve citation-linked patent families across all IPC codes " + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title, Abstract, and Claims. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, no International Patent Classification-based filtering is applied. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to assess how adding Claims text to target patent family representations improves retrieval of citation-linked patent families across all technical domains." ), **_SHARED_METADATA, ) -class Dapfam_ALL_TitleAbstract_TitleAbstractClaimsDescription( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMAllTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMAllTitlAbsToFullTextRetrieval", description=( - "All-domain retrieval: no domain filtering. 
" - "Queries use title + abstract; corpus uses title, abstract, claims, and description. " - "Goal: evaluate full-text retrieval across all IPC codes. " + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title, Abstract, Claims, and Description. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, no International Patent Classification-based filtering is applied. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval performance using Title and Abstract query patent family representations and full-text target patent family representations across all technical domains." ), **_SHARED_METADATA, ) -class Dapfam_ALL_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMAllTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMAllTitlAbsClmToTitlAbsRetrieval", description=( - "All-domain retrieval: no domain filtering. " - "Queries use title, abstract, and claims; corpus uses title + abstract. " - "Goal: assess claim-augmented queries against surface-text patent family corpus. " + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title and Abstract. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, no International Patent Classification-based filtering is applied. 
" + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to measure the effect of Claims-augmented query patent family representations when targets are limited to Title and Abstract across all technical domains." ), **_SHARED_METADATA, ) -class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaims( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval", description=( - "All-domain retrieval : no domain filtering. " - "Both queries and corpus use title, abstract, and claims. " - "Goal: reproduce the paper’s full-claims setup across all IPC codes. " + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title, Abstract, and Claims. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, no International Patent Classification-based filtering is applied. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval when both query and target patent families use Claims-augmented representations across all technical domains." 
), **_SHARED_METADATA, ) -class Dapfam_ALL_TitleAbstractClaims_TitleAbstractClaimsDescription( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMAllTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMAllTitlAbsClmToFullTextRetrieval", description=( - "All-domain retrieval: no domain filtering. " - "Queries use title, abstract, and claims; corpus adds description. " - "Goal: evaluate complete-text patent family retrieval across all IPC codes. " + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title, Abstract, Claims, and Description. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, no International Patent Classification-based filtering is applied. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval performance using Claims-augmented query patent family representations full-text target patent family representations across all technical domains." 
), **_SHARED_METADATA, ) -# IN-domain (≥1 shared IPC top-three code) Tasks - - -class Dapfam_IN_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMInTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMInTitlAbsToTitlAbsRetrieval", description=( - "In-domain retrieval: query and target share at least one IPC top-three code." - "Queries use title + abstract; corpus uses title + abstract. " - "Goal: retrieve citation-linked patents within the same domain " + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title and Abstract. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing at least one three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to retrieve citation-linked patent families using query and target patent family representations of Title and Abstract within the same technical domain." 
), **_SHARED_METADATA, ) -class Dapfam_IN_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMInTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMInTitlAbsToTitlAbsClmRetrieval", description=( - "In-domain retrieval: query and target share at least one IPC top-three code." - "Queries use title + abstract; corpus adds claims." - "Goal: leverage claims for in-domain patent retrieval." + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title, Abstract, and Claims. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing at least one three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to assess how adding Claims text to target patent family representations improves retrieval of citation-linked patent families within the same technical domain." ), **_SHARED_METADATA, ) -class Dapfam_IN_TitleAbstract_TitleAbstractClaimsDescription( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMInTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMInTitlAbsToFullTextRetrieval", description=( - "In-domain retrieval: query and target share at least one IPC top-three code." 
- "Queries use title + abstract; corpus uses title, abstract, claims, and description. " - "Goal: evaluate extended-text in-domain retrieval." + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title, Abstract, Claims, and Description. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing at least one three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval performance using Title and Abstract query patent family representations and full-text target patent family representations within the same technical domain." ), **_SHARED_METADATA, ) -class Dapfam_IN_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMInTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMInTitlAbsClmToTitlAbsRetrieval", description=( - "In-domain retrieval: query and target share at least one IPC top-three code." - "Queries use title, abstract, and claims; corpus uses title + abstract. " - "Goal: assess claim-driven in-domain queries." + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title and Abstract. " + "Relevant target families have a citation link (cited or citing) with the query family. 
" + "Additionally, only targets sharing at least one three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to measure the effect of Claims-augmented query patent family representations when targets are limited to Title and Abstract within the same technical domain." ), **_SHARED_METADATA, ) -class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMInTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMInTitlAbsClmToTitlAbsClmRetrieval", description=( - "In-domain retrieval: query and target share at least one IPC top-three code." - "Both queries and corpus use title, abstract, and claims. " - "Goal: reproduce the in-domain full-claims setup." + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title, Abstract, and Claims. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing at least one three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. 
" + "The goal of the task is to evaluate retrieval when both query and target patent families use Claims-augmented representations within the same technical domain." ), **_SHARED_METADATA, ) -class Dapfam_IN_TitleAbstractClaims_TitleAbstractClaimsDescription( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMInTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "IN" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMInTitlAbsClmToFullTextRetrieval", description=( - "In-domain retrieval: query and target share at least one IPC top-three code." - "Queries use title, abstract, and claims; corpus adds description. " - "Goal: evaluate complete-text in-domain retrieval." + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title, Abstract, Claims, and Description. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing at least one three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval performance using Claims-augmented query patent family representations full-text target patent family representations within the same technical domain." 
), **_SHARED_METADATA, ) -# OUT-of-domain (no IPC top-three overlap) Tasks - - -class Dapfam_OUT_TitleAbstract_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMOutTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMOutTitlAbsToTitlAbsRetrieval", description=( - "Out-of-domain retrieval: query and target share no IPC top-three codes. " - "Queries use title + abstract; corpus uses title + abstract. " - "Goal: retrieve citation-linked patents across different IPC domains." + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title and Abstract. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing no three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to retrieve citation-linked patent families using query and target patent family representations of Title and Abstract across different technical domains." ), **_SHARED_METADATA, ) -class Dapfam_OUT_TitleAbstract_TitleAbstractClaims(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMOutTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMOutTitlAbsToTitlAbsClmRetrieval", description=( - "Out-of-domain retrieval: query and target share no IPC top-three codes. 
" - "Queries use title + abstract; corpus adds claims. " - "Goal: leverage claims for cross-domain patent retrieval." + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title, Abstract, and Claims. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing no three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to assess how adding Claims text to target patent family representations improves retrieval of citation-linked patent families across different technical domains." ), **_SHARED_METADATA, ) -class Dapfam_OUT_TitleAbstract_TitleAbstractClaimsDescription( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMOutTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstract"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMOutTitlAbsToFullTextRetrieval", description=( - "Out-of-domain retrieval: query and target share no IPC top-three codes. " - "Queries use title + abstract; corpus uses title, abstract, claims, and description. " - "Goal: evaluate extended-text cross-domain retrieval." + "In this patent family retrieval task, query patent families are represented by Title and Abstract, " + "and target patent families are represented by Title, Abstract, Claims, and Description. " + "Relevant target families have a citation link (cited or citing) with the query family. 
" + "Additionally, only targets sharing no three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval performance using Title and Abstract query patent family representations and full-text target patent family representations across different technical domains." ), **_SHARED_METADATA, ) -class Dapfam_OUT_TitleAbstractClaims_TitleAbstract(_DAPFAMMixin, AbsTaskRetrieval): +class DAPFAMOutTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMOutTitlAbsClmToTitlAbsRetrieval", description=( - "Out-of-domain retrieval: query and target share no IPC top-three codes. " - "Queries use title, abstract, and claims; corpus uses title + abstract. " - "Goal: assess claim-driven cross-domain queries." + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title and Abstract. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing no three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. 
" + "The goal of the task is to measure the effect of Claims-augmented query patent family representations when targets are limited to Title and Abstract across different technical domains." ), **_SHARED_METADATA, ) -class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaims( - _DAPFAMMixin, AbsTaskRetrieval -): +class DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval", description=( - "Out-of-domain retrieval: query and target share no IPC top-three codes. " - "Both queries and corpus use title, abstract, and claims. " - "Goal: reproduce the out-of-domain full-claims setup." + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title, Abstract, and Claims. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing no three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval when both query and target patent families use Claims-augmented representations across different technical domains." 
), **_SHARED_METADATA, ) -class Dapfam_OUT_TitleAbstractClaims_TitleAbstractClaimsDescription( - _DAPFAMMixin, AbsTaskRetrieval -): - # In the paper - domain_filter = "OUT" +class DAPFAMOutTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): + domain_filter = "OUT" query_fields = _QUERY_FIELDS["TitleAbstractClaims"] corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + in_paper = False metadata = TaskMetadata( - name=__qualname__, + name="DAPFAMOutTitlAbsClmToFullTextRetrieval", description=( - "Out-of-domain retrieval: query and target share no IPC top-three codes. " - "Queries use title, abstract, and claims; corpus adds description. " - "Goal: evaluate complete-text cross-domain retrieval." + "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " + "and target patent families are represented by Title, Abstract, Claims, and Description. " + "Relevant target families have a citation link (cited or citing) with the query family. " + "Additionally, only targets sharing no three-character International Patent Classification code with the query family. " + "Relevance and labelling scheme are described in detail in Section 3.4 and 3.5 of Ayaou et al. (2025), arXiv:2506.22141." + "Patents are aggregated and represented at the family level to reduce redundancy across jurisdictions. " + "The goal of the task is to evaluate retrieval performance using Claims-augmented query patent family representations full-text target patent family representations across different technical domains." 
), **_SHARED_METADATA, ) From 33129b8f0016c9803f39e0de2dfa9fdbee436e55 Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 25 Jul 2025 18:06:39 +0200 Subject: [PATCH 17/29] changed default value of quantization to false --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index b89168e095..7da61f9b36 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -126,7 +126,7 @@ def _dapfam_evaluate( qry_ids, qry_texts = zip(*queries.items()) encode_kwargs = kwargs.get("encode_kwargs", {}) - quantize = kwargs.get("quantize", True) + quantize = kwargs.get("quantize", False) # check similarity function name : print(model_wrapper.model.similarity_fn_name) From 58cf75c4fbffb0c29951bd2b177622323348200b Mon Sep 17 00:00:00 2001 From: iliass Date: Thu, 7 Aug 2025 15:48:41 +0200 Subject: [PATCH 18/29] added the import to all DAPFAM tasks; tested that the works; verified compliance with the checklist --- mteb/tasks/Retrieval/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 91a63662ae..4b7054b2e3 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -49,6 +49,7 @@ from .eng.CQADupstackWebmastersRetrieval import * from .eng.CQADupstackWordpressRetrieval import * from .eng.DBPediaRetrieval import * +from .eng.DAPFAMPatentRetrieval import * from .eng.FaithDialRetrieval import * from .eng.FeedbackQARetrieval import * from .eng.FEVERRetrieval import * From 4128a75fd8190f70296cef3f945e965113a8c990 Mon Sep 17 00:00:00 2001 From: Iliass Ayaou Date: Thu, 7 Aug 2025 16:37:38 +0200 Subject: [PATCH 19/29] Update mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py Co-authored-by: Roman Solomatin --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 7da61f9b36..6c21a6df2f 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -129,7 +129,7 @@ def _dapfam_evaluate( quantize = kwargs.get("quantize", False) # check similarity function name : - print(model_wrapper.model.similarity_fn_name) + logger.info(model_wrapper.model.similarity_fn_name) emb_c = model_wrapper.model.encode( list(corp_texts), **encode_kwargs, show_progress_bar=True ) From 6f3e0d1a26110de1a6874385e89e86a690a17ded Mon Sep 17 00:00:00 2001 From: iliass Date: Thu, 7 Aug 2025 17:39:17 +0200 Subject: [PATCH 20/29] added revision numbers to all dataset loading operations as well as the metadata itself --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 6c21a6df2f..b43df299ff 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -7,7 +7,7 @@ from sentence_transformers.quantization import quantize_embeddings from sklearn.metrics import average_precision_score -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval, logger from ....abstasks.TaskMetadata import TaskMetadata HF_REPO = "datalyes/DAPFAM_patent" @@ -22,7 +22,7 @@ }""" _SHARED_METADATA = dict( - dataset={"path": HF_REPO, "revision": "main"}, + dataset={"path": HF_REPO, "revision": "3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544"}, reference=REFERENCE, type="Retrieval", category="p2p", @@ -70,9 +70,9 @@ class _DAPFAMMixin: in_paper: bool = False def load_data(self, **_) -> tuple[dict, dict, dict]: - ds_c = load_dataset(HF_REPO, "corpus", split="train") 
- ds_q = load_dataset(HF_REPO, "queries", split="train") - ds_r = load_dataset(HF_REPO, "relations", split="train") + ds_c = load_dataset(HF_REPO, "corpus", split="train",revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544") + ds_q = load_dataset(HF_REPO, "queries", split="train", revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544") + ds_r = load_dataset(HF_REPO, "relations", split="train", revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544") self.corpus = { "test": { From be082653492e643d53113a8c5d05adae81668e26 Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 22 Aug 2025 18:18:58 +0200 Subject: [PATCH 21/29] intermediate changes, refresh local branch --- mteb/leaderboard/benchmark_selector.py | 1 + mteb/tasks/Retrieval/__init__.py | 1 + mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index def6ab18f5..21066069dc 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -3,6 +3,7 @@ from dataclasses import dataclass import gradio as gr +from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual import mteb from mteb import Benchmark diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 36fa99148c..d3ef4116bb 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -55,6 +55,7 @@ from .eng.CQADupstackUnixRetrieval import * from .eng.CQADupstackWebmastersRetrieval import * from .eng.CQADupstackWordpressRetrieval import * +from .eng.DAPFAMPatentRetrieval import * from .eng.DBPediaRetrieval import * from .eng.DAPFAMPatentRetrieval import * from .eng.FaithDialRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index b43df299ff..0f1a6699d7 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ 
b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -29,7 +29,7 @@ task_subtypes=["Article retrieval", "Patent retrieval"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="ndcg@10", + main_score="ndcg_at_10", date=("1964-06-26", "2023-06-20"), domains=["Engineering", "Chemistry", "Legal"], license="cc-by-nc-sa-4.0", From 09afbd830df763680fab0964b61df4836c97aded Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 5 Sep 2025 12:20:09 +0200 Subject: [PATCH 22/29] intermediate changes, refresh local branch again --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 320 +++++------------- 1 file changed, 90 insertions(+), 230 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 0f1a6699d7..0fc7933c04 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -7,7 +7,7 @@ from sentence_transformers.quantization import quantize_embeddings from sklearn.metrics import average_precision_score -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval, logger +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval from ....abstasks.TaskMetadata import TaskMetadata HF_REPO = "datalyes/DAPFAM_patent" @@ -22,7 +22,7 @@ }""" _SHARED_METADATA = dict( - dataset={"path": HF_REPO, "revision": "3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544"}, + dataset={"path": HF_REPO, "revision": "main"}, reference=REFERENCE, type="Retrieval", category="p2p", @@ -36,6 +36,7 @@ annotations_creators="derived", sample_creation="created", bibtex_citation=BIBTEX, + judged_docs_only_flag=True, ) # text-field dictionaries @@ -70,9 +71,9 @@ class _DAPFAMMixin: in_paper: bool = False def load_data(self, **_) -> tuple[dict, dict, dict]: - ds_c = load_dataset(HF_REPO, "corpus", split="train",revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544") - ds_q = load_dataset(HF_REPO, "queries", split="train", revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544") - ds_r = 
load_dataset(HF_REPO, "relations", split="train", revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544") + ds_c = load_dataset(HF_REPO, "corpus", split="train") + ds_q = load_dataset(HF_REPO, "queries", split="train") + ds_r = load_dataset(HF_REPO, "relations", split="train") self.corpus = { "test": { @@ -91,161 +92,37 @@ def load_data(self, **_) -> tuple[dict, dict, dict]: } } - qrels: dict[str, dict[str, tuple[float, str]]] = {} + raw: dict[str, dict[str, tuple[float, str]]] = {} for r in ds_r: - qid, pid = r["query_id"], r["relevant_id"] - qrels.setdefault(qid, {})[pid] = ( - float(r["relevance_score"]), - r["domain_rel"], - ) - # 4) Assign for MTEB - self.relevant_docs = {"test": qrels} - self.data_loaded = True - return self.corpus, self.queries, self.relevant_docs - - def _dapfam_evaluate( - self, - model_wrapper, - split: str = "test", - subsets_to_run=None, - **kwargs, - ) -> dict[str, dict[str, float]]: - """Custom evaluation that quantises embeddings to uint8 before - normalisation (per the paper) and - computes recall / nDCG / mAP exactly like the paper if quantize=True and similarity=cosine. - It is fully deterministic. 
- """ - if not getattr(self, "data_loaded", False): - self.load_data() - - corpus = self.corpus[split] - queries = self.queries[split] - qrels = self.relevant_docs[split] - - corp_ids, corp_texts = zip(*corpus.items()) - qry_ids, qry_texts = zip(*queries.items()) - - encode_kwargs = kwargs.get("encode_kwargs", {}) - quantize = kwargs.get("quantize", False) - - # check similarity function name : - logger.info(model_wrapper.model.similarity_fn_name) - emb_c = model_wrapper.model.encode( - list(corp_texts), **encode_kwargs, show_progress_bar=True - ) - emb_q = model_wrapper.model.encode( - list(qry_texts), **encode_kwargs, show_progress_bar=True - ) - - # uint8 quantisation (per paper) if chosen then we go back to fp32 to avoid error - # by sentence transformers similarity function (doesn't accept quantized embeddings) - if quantize: - emb_c_q = quantize_embeddings(emb_c, precision="uint8") - emb_q_q = quantize_embeddings(emb_q, precision="uint8") - emb_c = emb_c_q.astype(np.float32) - emb_q = emb_q_q.astype(np.float32) - - sims = model_wrapper.model.similarity(emb_q, emb_c).cpu().numpy() - - # ranking per query dict[str, list[str]] - run: dict[str, list[str]] = {} - for i, qid in enumerate(qry_ids): - scores = sims[i] - idxs = np.argsort(-scores) - run[qid] = [(corp_ids[j], float(scores[j])) for j in idxs] - - # ---- metric helpers ---- - def ndcg_at_k(preds: list[str], refset: set[str], k: int) -> float: - if not refset: - return 1.0 - gains = [1.0 if pid in refset else 0.0 for pid in preds[:k]] - - def dcg(g): - return sum((2**v - 1) / math.log2(i + 2) for i, v in enumerate(g)) - - ideal = sorted(gains, reverse=True) - idcg = dcg(ideal) - # if ideal DCG is zero, return zero per paper - if idcg <= 0.0: - return 0.0 - return dcg(gains) / idcg - - rec10 = [] - rec100 = [] - ndc10 = [] - ndc100 = [] - map10 = [] - map100 = [] - - for qid, ranking in run.items(): - preds = [pid for pid, _ in ranking] - full = {d for d, (s, _) in qrels.get(qid, {}).items() if s > 0} - if 
self.domain_filter: - relset = { - pid - for pid, (s, dom) in qrels[qid].items() - if s > 0 and dom == self.domain_filter - } + qid = r["query_id"] + pid = r["relevant_id"] + raw.setdefault(qid, {})[pid] = (float(r["relevance_score"]), r["domain_rel"]) + self._qrels_raw = {"test": raw} + + + qrels_int: dict[str, dict[str, int]] = {} + for qid, pairs in raw.items(): + if self.domain_filter is None: + pos = {pid: 1 for pid, (s, dom) in pairs.items() if s > 0.0} else: - relset = full - - # recall@K - for k, rec_list in ((10, rec10), (100, rec100)): - hits = len(set(preds[:k]) & relset) - rec = hits / len(relset) if relset else 1.0 - rec_list.append(rec) - - # nDCG@K - ndc10.append(ndcg_at_k(preds, relset, 10)) - ndc100.append(ndcg_at_k(preds, relset, 100)) - - # mAP@K via rank-based scores over *top-K* only, per paper - for k, map_list in ((10, map10), (100, map100)): - # build binary truth for the top-k - topk = preds[:k] - y_true = [1 if pid in relset else 0 for pid in topk] - # if no positives exist, perfect; else zero if none in top-k - if sum(y_true) == 0: - ap = 1.0 if not relset else 0.0 - else: - # rank‐based scores k, k−1, …, 1 - y_scores = [k - i for i in range(k)] - ap = average_precision_score(y_true, y_scores) - map_list.append(ap) - - return { - "default": { - "recall@10": float(np.mean(rec10)), - "recall@100": float(np.mean(rec100)), - "ndcg@10": float(np.mean(ndc10)), - "ndcg@100": float(np.mean(ndc100)), - "map@10": float(np.mean(map10)), - "map@100": float(np.mean(map100)), - "main_score": float(np.mean(ndc10)), - } - } - - def evaluate( - self, - model_wrapper, - split: str = "test", - subsets_to_run=None, - **kwargs, - ) -> dict[str, dict[str, float]]: - return self._dapfam_evaluate(model_wrapper, split, subsets_to_run, **kwargs) + pos = {pid: 1 for pid, (s, dom) in pairs.items() if s > 0.0 and dom == self.domain_filter} + if pos: + qrels_int[qid] = pos + self.relevant_docs = {"test": qrels_int} + self.data_loaded = True + return self.corpus, 
self.queries, self.relevant_docs # ─────────────────────────────────────────────────── # DAPFAM Patent Family Retrieval Tasks - class DAPFAMAllTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstract'] in_paper = False metadata = TaskMetadata( - name="DAPFAMAllTitlAbsToTitlAbsRetrieval", + name='DAPFAMAllTitlAbsToTitlAbsRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title and Abstract. " @@ -258,14 +135,13 @@ class DAPFAMAllTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMAllTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] in_paper = True metadata = TaskMetadata( - name="DAPFAMAllTitlAbsToTitlAbsClmRetrieval", + name='DAPFAMAllTitlAbsToTitlAbsClmRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, and Claims. 
" @@ -278,14 +154,13 @@ class DAPFAMAllTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMAllTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] in_paper = False metadata = TaskMetadata( - name="DAPFAMAllTitlAbsToFullTextRetrieval", + name='DAPFAMAllTitlAbsToFullTextRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -298,14 +173,13 @@ class DAPFAMAllTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMAllTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstract'] in_paper = False metadata = TaskMetadata( - name="DAPFAMAllTitlAbsClmToTitlAbsRetrieval", + name='DAPFAMAllTitlAbsClmToTitlAbsRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title and Abstract. 
" @@ -318,14 +192,13 @@ class DAPFAMAllTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] in_paper = True metadata = TaskMetadata( - name="DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval", + name='DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, and Claims. " @@ -338,14 +211,13 @@ class DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMAllTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] in_paper = False metadata = TaskMetadata( - name="DAPFAMAllTitlAbsClmToFullTextRetrieval", + name='DAPFAMAllTitlAbsClmToFullTextRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, Claims, and Description. 
" @@ -358,14 +230,13 @@ class DAPFAMAllTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMInTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "IN" - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + domain_filter = 'IN' + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstract'] in_paper = False metadata = TaskMetadata( - name="DAPFAMInTitlAbsToTitlAbsRetrieval", + name='DAPFAMInTitlAbsToTitlAbsRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title and Abstract. " @@ -378,14 +249,13 @@ class DAPFAMInTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMInTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "IN" - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + domain_filter = 'IN' + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] in_paper = True metadata = TaskMetadata( - name="DAPFAMInTitlAbsToTitlAbsClmRetrieval", + name='DAPFAMInTitlAbsToTitlAbsClmRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, and Claims. 
" @@ -398,14 +268,13 @@ class DAPFAMInTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMInTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "IN" - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + domain_filter = 'IN' + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] in_paper = False metadata = TaskMetadata( - name="DAPFAMInTitlAbsToFullTextRetrieval", + name='DAPFAMInTitlAbsToFullTextRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -418,14 +287,13 @@ class DAPFAMInTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMInTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "IN" - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + domain_filter = 'IN' + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstract'] in_paper = False metadata = TaskMetadata( - name="DAPFAMInTitlAbsClmToTitlAbsRetrieval", + name='DAPFAMInTitlAbsClmToTitlAbsRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title and Abstract. 
" @@ -438,14 +306,13 @@ class DAPFAMInTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMInTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "IN" - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + domain_filter = 'IN' + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] in_paper = True metadata = TaskMetadata( - name="DAPFAMInTitlAbsClmToTitlAbsClmRetrieval", + name='DAPFAMInTitlAbsClmToTitlAbsClmRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, and Claims. " @@ -458,14 +325,13 @@ class DAPFAMInTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMInTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "IN" - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + domain_filter = 'IN' + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] in_paper = False metadata = TaskMetadata( - name="DAPFAMInTitlAbsClmToFullTextRetrieval", + name='DAPFAMInTitlAbsClmToFullTextRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, Claims, and Description. 
" @@ -478,14 +344,13 @@ class DAPFAMInTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMOutTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "OUT" - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + domain_filter = 'OUT' + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstract'] in_paper = False metadata = TaskMetadata( - name="DAPFAMOutTitlAbsToTitlAbsRetrieval", + name='DAPFAMOutTitlAbsToTitlAbsRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title and Abstract. " @@ -498,14 +363,13 @@ class DAPFAMOutTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMOutTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "OUT" - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + domain_filter = 'OUT' + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] in_paper = True metadata = TaskMetadata( - name="DAPFAMOutTitlAbsToTitlAbsClmRetrieval", + name='DAPFAMOutTitlAbsToTitlAbsClmRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, and Claims. 
" @@ -518,14 +382,13 @@ class DAPFAMOutTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMOutTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "OUT" - query_fields = _QUERY_FIELDS["TitleAbstract"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + domain_filter = 'OUT' + query_fields = _QUERY_FIELDS['TitleAbstract'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] in_paper = False metadata = TaskMetadata( - name="DAPFAMOutTitlAbsToFullTextRetrieval", + name='DAPFAMOutTitlAbsToFullTextRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -538,14 +401,13 @@ class DAPFAMOutTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMOutTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "OUT" - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstract"] + domain_filter = 'OUT' + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstract'] in_paper = False metadata = TaskMetadata( - name="DAPFAMOutTitlAbsClmToTitlAbsRetrieval", + name='DAPFAMOutTitlAbsClmToTitlAbsRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title and Abstract. 
" @@ -558,14 +420,13 @@ class DAPFAMOutTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "OUT" - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] + domain_filter = 'OUT' + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] in_paper = True metadata = TaskMetadata( - name="DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval", + name='DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, and Claims. " @@ -578,14 +439,13 @@ class DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) - class DAPFAMOutTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = "OUT" - query_fields = _QUERY_FIELDS["TitleAbstractClaims"] - corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] + domain_filter = 'OUT' + query_fields = _QUERY_FIELDS['TitleAbstractClaims'] + corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] in_paper = False metadata = TaskMetadata( - name="DAPFAMOutTitlAbsClmToFullTextRetrieval", + name='DAPFAMOutTitlAbsClmToFullTextRetrieval', description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -596,4 +456,4 @@ class DAPFAMOutTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): "The goal of the task is to evaluate retrieval performance using Claims-augmented query patent family representations full-text target patent family representations across different technical domains." 
), **_SHARED_METADATA, - ) + ) \ No newline at end of file From c64bd5536444310e137ebb379f57420bb3611e75 Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 5 Sep 2025 14:19:58 +0200 Subject: [PATCH 23/29] scale back to standard evaluation with empty set exclusion, various cosmetic/formatting changes --- mteb/leaderboard/benchmark_selector.py | 1 - mteb/tasks/Retrieval/__init__.py | 1 - .../Retrieval/eng/DAPFAMPatentRetrieval.py | 213 ++++++++++-------- 3 files changed, 125 insertions(+), 90 deletions(-) diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 21066069dc..def6ab18f5 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -3,7 +3,6 @@ from dataclasses import dataclass import gradio as gr -from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual import mteb from mteb import Benchmark diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index d3ef4116bb..670d4ea5b1 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -57,7 +57,6 @@ from .eng.CQADupstackWordpressRetrieval import * from .eng.DAPFAMPatentRetrieval import * from .eng.DBPediaRetrieval import * -from .eng.DAPFAMPatentRetrieval import * from .eng.FaithDialRetrieval import * from .eng.FeedbackQARetrieval import * from .eng.FEVERRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 0fc7933c04..9aa1053daf 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -1,28 +1,25 @@ from __future__ import annotations -import math - -import numpy as np from datasets import load_dataset -from sentence_transformers.quantization import quantize_embeddings -from sklearn.metrics import average_precision_score from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval from ....abstasks.TaskMetadata import 
TaskMetadata HF_REPO = "datalyes/DAPFAM_patent" REFERENCE = "https://arxiv.org/abs/2506.22141" -BIBTEX = r"""@misc{ayaou2025dapfam, - title = {DAPFAM: A Domain-Aware Patent Retrieval Dataset Aggregated at the Family Level}, - author = {Ayaou, Iliass and Cavallucci, Denis and Chibane, Hicham}, - year = {2025}, - eprint = {2506.22141}, - archivePrefix= {arXiv}, - primaryClass = {cs.CL} +BIBTEX = r""" +@misc{ayaou2025dapfamdomainawarefamilyleveldataset, + title = {DAPFAM: A Domain-Aware Family-level Dataset to benchmark cross domain patent retrieval}, + author = {Iliass Ayaou and Denis Cavallucci and Hicham Chibane}, + year = {2025}, + eprint = {2506.22141}, + archivePrefix = {arXiv}, + primaryClass = {cs.CL}, + url = {https://arxiv.org/abs/2506.22141}, }""" _SHARED_METADATA = dict( - dataset={"path": HF_REPO, "revision": "main"}, + dataset={"path": HF_REPO, "revision": "3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544"}, reference=REFERENCE, type="Retrieval", category="p2p", @@ -36,7 +33,7 @@ annotations_creators="derived", sample_creation="created", bibtex_citation=BIBTEX, - judged_docs_only_flag=True, + judged_docs_only_flag=True, ) # text-field dictionaries @@ -71,9 +68,24 @@ class _DAPFAMMixin: in_paper: bool = False def load_data(self, **_) -> tuple[dict, dict, dict]: - ds_c = load_dataset(HF_REPO, "corpus", split="train") - ds_q = load_dataset(HF_REPO, "queries", split="train") - ds_r = load_dataset(HF_REPO, "relations", split="train") + ds_c = load_dataset( + HF_REPO, + "corpus", + split="train", + revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544", + ) + ds_q = load_dataset( + HF_REPO, + "queries", + split="train", + revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544", + ) + ds_r = load_dataset( + HF_REPO, + "relations", + split="train", + revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544", + ) self.corpus = { "test": { @@ -96,33 +108,41 @@ def load_data(self, **_) -> tuple[dict, dict, dict]: for r in ds_r: qid = r["query_id"] pid = r["relevant_id"] - 
raw.setdefault(qid, {})[pid] = (float(r["relevance_score"]), r["domain_rel"]) + raw.setdefault(qid, {})[pid] = ( + float(r["relevance_score"]), + r["domain_rel"], + ) self._qrels_raw = {"test": raw} - qrels_int: dict[str, dict[str, int]] = {} for qid, pairs in raw.items(): if self.domain_filter is None: pos = {pid: 1 for pid, (s, dom) in pairs.items() if s > 0.0} else: - pos = {pid: 1 for pid, (s, dom) in pairs.items() if s > 0.0 and dom == self.domain_filter} - if pos: + pos = { + pid: 1 + for pid, (s, dom) in pairs.items() + if s > 0.0 and dom == self.domain_filter + } + if pos: qrels_int[qid] = pos self.relevant_docs = {"test": qrels_int} self.data_loaded = True return self.corpus, self.queries, self.relevant_docs + # ─────────────────────────────────────────────────── # DAPFAM Patent Family Retrieval Tasks + class DAPFAMAllTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstract'] + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name='DAPFAMAllTitlAbsToTitlAbsRetrieval', + name="DAPFAMAllTitlAbsToTitlAbsRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title and Abstract. 
" @@ -135,13 +155,14 @@ class DAPFAMAllTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMAllTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name='DAPFAMAllTitlAbsToTitlAbsClmRetrieval', + name="DAPFAMAllTitlAbsToTitlAbsClmRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, and Claims. " @@ -154,13 +175,14 @@ class DAPFAMAllTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMAllTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name='DAPFAMAllTitlAbsToFullTextRetrieval', + name="DAPFAMAllTitlAbsToFullTextRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, Claims, and Description. 
" @@ -173,13 +195,14 @@ class DAPFAMAllTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMAllTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstract'] + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name='DAPFAMAllTitlAbsClmToTitlAbsRetrieval', + name="DAPFAMAllTitlAbsClmToTitlAbsRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title and Abstract. " @@ -192,13 +215,14 @@ class DAPFAMAllTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name='DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval', + name="DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, and Claims. 
" @@ -211,13 +235,14 @@ class DAPFAMAllTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMAllTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): domain_filter = None - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name='DAPFAMAllTitlAbsClmToFullTextRetrieval', + name="DAPFAMAllTitlAbsClmToFullTextRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -230,13 +255,14 @@ class DAPFAMAllTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMInTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'IN' - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstract'] + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name='DAPFAMInTitlAbsToTitlAbsRetrieval', + name="DAPFAMInTitlAbsToTitlAbsRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title and Abstract. 
" @@ -249,13 +275,14 @@ class DAPFAMInTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMInTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'IN' - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name='DAPFAMInTitlAbsToTitlAbsClmRetrieval', + name="DAPFAMInTitlAbsToTitlAbsClmRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, and Claims. " @@ -268,13 +295,14 @@ class DAPFAMInTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMInTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'IN' - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name='DAPFAMInTitlAbsToFullTextRetrieval', + name="DAPFAMInTitlAbsToFullTextRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, Claims, and Description. 
" @@ -287,13 +315,14 @@ class DAPFAMInTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMInTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'IN' - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstract'] + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name='DAPFAMInTitlAbsClmToTitlAbsRetrieval', + name="DAPFAMInTitlAbsClmToTitlAbsRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title and Abstract. " @@ -306,13 +335,14 @@ class DAPFAMInTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMInTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'IN' - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name='DAPFAMInTitlAbsClmToTitlAbsClmRetrieval', + name="DAPFAMInTitlAbsClmToTitlAbsClmRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, and Claims. 
" @@ -325,13 +355,14 @@ class DAPFAMInTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMInTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'IN' - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] + domain_filter = "IN" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name='DAPFAMInTitlAbsClmToFullTextRetrieval', + name="DAPFAMInTitlAbsClmToFullTextRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -344,13 +375,14 @@ class DAPFAMInTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMOutTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'OUT' - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstract'] + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name='DAPFAMOutTitlAbsToTitlAbsRetrieval', + name="DAPFAMOutTitlAbsToTitlAbsRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title and Abstract. 
" @@ -363,13 +395,14 @@ class DAPFAMOutTitlAbsToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMOutTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'OUT' - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name='DAPFAMOutTitlAbsToTitlAbsClmRetrieval', + name="DAPFAMOutTitlAbsToTitlAbsClmRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, and Claims. " @@ -382,13 +415,14 @@ class DAPFAMOutTitlAbsToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMOutTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'OUT' - query_fields = _QUERY_FIELDS['TitleAbstract'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstract"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name='DAPFAMOutTitlAbsToFullTextRetrieval', + name="DAPFAMOutTitlAbsToFullTextRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title and Abstract, " "and target patent families are represented by Title, Abstract, Claims, and Description. 
" @@ -401,13 +435,14 @@ class DAPFAMOutTitlAbsToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMOutTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'OUT' - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstract'] + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstract"] in_paper = False metadata = TaskMetadata( - name='DAPFAMOutTitlAbsClmToTitlAbsRetrieval', + name="DAPFAMOutTitlAbsClmToTitlAbsRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title and Abstract. " @@ -420,13 +455,14 @@ class DAPFAMOutTitlAbsClmToTitlAbsRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'OUT' - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaims'] + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaims"] in_paper = True metadata = TaskMetadata( - name='DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval', + name="DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, and Claims. 
" @@ -439,13 +475,14 @@ class DAPFAMOutTitlAbsClmToTitlAbsClmRetrieval(_DAPFAMMixin, AbsTaskRetrieval): **_SHARED_METADATA, ) + class DAPFAMOutTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): - domain_filter = 'OUT' - query_fields = _QUERY_FIELDS['TitleAbstractClaims'] - corpus_fields = _CORPUS_FIELDS['TitleAbstractClaimsDescription'] + domain_filter = "OUT" + query_fields = _QUERY_FIELDS["TitleAbstractClaims"] + corpus_fields = _CORPUS_FIELDS["TitleAbstractClaimsDescription"] in_paper = False metadata = TaskMetadata( - name='DAPFAMOutTitlAbsClmToFullTextRetrieval', + name="DAPFAMOutTitlAbsClmToFullTextRetrieval", description=( "In this patent family retrieval task, query patent families are represented by Title, Abstract, and Claims, " "and target patent families are represented by Title, Abstract, Claims, and Description. " @@ -456,4 +493,4 @@ class DAPFAMOutTitlAbsClmToFullTextRetrieval(_DAPFAMMixin, AbsTaskRetrieval): "The goal of the task is to evaluate retrieval performance using Claims-augmented query patent family representations full-text target patent family representations across different technical domains." 
), **_SHARED_METADATA, - ) \ No newline at end of file + ) From 2125f1604054f3f8fe413c6e12d0bbce81a17558 Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 5 Sep 2025 15:13:45 +0200 Subject: [PATCH 24/29] minor cosmetic/formatting changes --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index 9aa1053daf..b7a9079fcf 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -9,14 +9,15 @@ REFERENCE = "https://arxiv.org/abs/2506.22141" BIBTEX = r""" @misc{ayaou2025dapfamdomainawarefamilyleveldataset, - title = {DAPFAM: A Domain-Aware Family-level Dataset to benchmark cross domain patent retrieval}, + title = {DAPFAM: A Domain-Aware Family-level Dataset to benchmark cross domain patent retrieval}, author = {Iliass Ayaou and Denis Cavallucci and Hicham Chibane}, year = {2025}, eprint = {2506.22141}, archivePrefix = {arXiv}, primaryClass = {cs.CL}, - url = {https://arxiv.org/abs/2506.22141}, -}""" + url = {https://arxiv.org/abs/2506.22141}, +} +""" _SHARED_METADATA = dict( dataset={"path": HF_REPO, "revision": "3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544"}, @@ -52,7 +53,7 @@ ], } -# paper variants used in Table 4 +# Text representations used in the paper _IN_PAPER = { ("TitleAbstract", "TitleAbstractClaims"), ("TitleAbstractClaims", "TitleAbstractClaims"), From d8774fa3fd9206f28058ef14a33059ecb0b912db Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 5 Sep 2025 17:34:54 +0200 Subject: [PATCH 25/29] fixed main metric to be ndcg_at_100 as in the paper --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index b7a9079fcf..a0c3bc3c01 100644 --- 
a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -27,7 +27,7 @@ task_subtypes=["Article retrieval", "Patent retrieval"], eval_splits=["test"], eval_langs=["eng-Latn"], - main_score="ndcg_at_10", + main_score="ndcg_at_100", date=("1964-06-26", "2023-06-20"), domains=["Engineering", "Chemistry", "Legal"], license="cc-by-nc-sa-4.0", From 873e3b2d65520fe9341d295584d87214586fab8a Mon Sep 17 00:00:00 2001 From: iliass Date: Fri, 5 Sep 2025 18:46:56 +0200 Subject: [PATCH 26/29] removed old code artifacts from previous versions --- mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index a0c3bc3c01..b61b24d064 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -34,7 +34,6 @@ annotations_creators="derived", sample_creation="created", bibtex_citation=BIBTEX, - judged_docs_only_flag=True, ) # text-field dictionaries @@ -53,13 +52,6 @@ ], } -# Text representations used in the paper -_IN_PAPER = { - ("TitleAbstract", "TitleAbstractClaims"), - ("TitleAbstractClaims", "TitleAbstractClaims"), -} - - # MIX-IN with shared logic + metric implementation class _DAPFAMMixin: # class-level attributes are filled in each concrete subclass From 60201613e1460b247e28b25df78f781027e7b796 Mon Sep 17 00:00:00 2001 From: iliass Date: Mon, 8 Sep 2025 15:13:38 +0200 Subject: [PATCH 27/29] read appropriate loading arguments from task metadata, remove unnecessary class attribute --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index b61b24d064..a5f037ec91 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++
b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -20,12 +20,13 @@ """ _SHARED_METADATA = dict( - dataset={"path": HF_REPO, "revision": "3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544"}, + dataset={"path": HF_REPO, "revision": "780f4011d60297fc6e97a4119b0c516d13afea2d"}, reference=REFERENCE, type="Retrieval", category="p2p", task_subtypes=["Article retrieval", "Patent retrieval"], eval_splits=["test"], + load_splits=["train"], eval_langs=["eng-Latn"], main_score="ndcg_at_100", date=("1964-06-26", "2023-06-20"), @@ -52,32 +53,38 @@ ], } -# MIX-IN with shared logic + metric implementation + +# MIX-IN with shared logic class _DAPFAMMixin: # class-level attributes are filled in each concrete subclass domain_filter: str | None = None query_fields: list[str] = [] corpus_fields: list[str] = [] - in_paper: bool = False - def load_data(self, **_) -> tuple[dict, dict, dict]: + def load_data(self, **kwargs) -> tuple[dict, dict, dict]: ds_c = load_dataset( - HF_REPO, + kwargs.get("dataset", {}).get("path", HF_REPO), "corpus", - split="train", - revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544", + split=kwargs.get("load_splits", "train"), + revision=kwargs.get("dataset", {}).get( + "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" + ), ) ds_q = load_dataset( - HF_REPO, + kwargs.get("dataset", {}).get("path", HF_REPO), "queries", - split="train", - revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544", + split=kwargs.get("load_splits", "train"), + revision=kwargs.get("dataset", {}).get( + "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" + ), ) ds_r = load_dataset( - HF_REPO, + kwargs.get("dataset", {}).get("path", HF_REPO), "relations", - split="train", - revision="3ad6eab6ed9b5fb1c0609b4dbf40e391ebb5a544", + split=kwargs.get("load_splits", "train"), + revision=kwargs.get("dataset", {}).get( + "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" + ), ) self.corpus = { From def9c1656ddd869d61458155b8273f53c6388307 Mon Sep 17 00:00:00 2001 From: iliass Date: Mon, 8 Sep 
2025 16:39:20 +0200 Subject: [PATCH 28/29] reformat bibtex ( remark on the assertion since it tries to match literal string instead of bibtex formatting, format inconsistent with arXiv default), fixed metadata, parameters read from task metadata, all tests passed --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index a5f037ec91..a099f84dd5 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -7,26 +7,24 @@ HF_REPO = "datalyes/DAPFAM_patent" REFERENCE = "https://arxiv.org/abs/2506.22141" -BIBTEX = r""" -@misc{ayaou2025dapfamdomainawarefamilyleveldataset, - title = {DAPFAM: A Domain-Aware Family-level Dataset to benchmark cross domain patent retrieval}, - author = {Iliass Ayaou and Denis Cavallucci and Hicham Chibane}, - year = {2025}, - eprint = {2506.22141}, - archivePrefix = {arXiv}, - primaryClass = {cs.CL}, - url = {https://arxiv.org/abs/2506.22141}, -} -""" +BIBTEX = r"""@misc{ayaou2025dapfamdomainawarefamilyleveldataset, + archiveprefix = {arXiv}, + author = {Iliass Ayaou and Denis Cavallucci and Hicham Chibane}, + eprint = {2506.22141}, + primaryclass = {cs.CL}, + title = {DAPFAM: A Domain-Aware Family-level Dataset to benchmark cross domain patent retrieval}, + url = {https://arxiv.org/abs/2506.22141}, + year = {2025}, +}""" _SHARED_METADATA = dict( dataset={"path": HF_REPO, "revision": "780f4011d60297fc6e97a4119b0c516d13afea2d"}, reference=REFERENCE, type="Retrieval", + modalities=["text"], category="p2p", task_subtypes=["Article retrieval", "Patent retrieval"], - eval_splits=["test"], - load_splits=["train"], + eval_splits=["train"], eval_langs=["eng-Latn"], main_score="ndcg_at_100", date=("1964-06-26", "2023-06-20"), @@ -34,7 +32,9 @@ license="cc-by-nc-sa-4.0", annotations_creators="derived", 
sample_creation="created", + dialect=[], bibtex_citation=BIBTEX, + is_public=True, ) # text-field dictionaries @@ -65,7 +65,7 @@ def load_data(self, **kwargs) -> tuple[dict, dict, dict]: ds_c = load_dataset( kwargs.get("dataset", {}).get("path", HF_REPO), "corpus", - split=kwargs.get("load_splits", "train"), + split=kwargs.get("eval_splits", "train"), revision=kwargs.get("dataset", {}).get( "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" ), @@ -73,7 +73,7 @@ def load_data(self, **kwargs) -> tuple[dict, dict, dict]: ds_q = load_dataset( kwargs.get("dataset", {}).get("path", HF_REPO), "queries", - split=kwargs.get("load_splits", "train"), + split=kwargs.get("eval_splits", "train"), revision=kwargs.get("dataset", {}).get( "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" ), @@ -81,14 +81,14 @@ def load_data(self, **kwargs) -> tuple[dict, dict, dict]: ds_r = load_dataset( kwargs.get("dataset", {}).get("path", HF_REPO), "relations", - split=kwargs.get("load_splits", "train"), + split=kwargs.get("eval_splits", "train"), revision=kwargs.get("dataset", {}).get( "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" ), ) self.corpus = { - "test": { + "train": { r["relevant_id"]: "\n".join( str(r[f]) for f in self.corpus_fields if r.get(f) ) @@ -96,7 +96,7 @@ def load_data(self, **kwargs) -> tuple[dict, dict, dict]: } } self.queries = { - "test": { + "train": { r["query_id"]: "\n".join( str(r[f]) for f in self.query_fields if r.get(f) ) @@ -112,7 +112,7 @@ def load_data(self, **kwargs) -> tuple[dict, dict, dict]: float(r["relevance_score"]), r["domain_rel"], ) - self._qrels_raw = {"test": raw} + self._qrels_raw = {"train": raw} qrels_int: dict[str, dict[str, int]] = {} for qid, pairs in raw.items(): @@ -127,7 +127,7 @@ def load_data(self, **kwargs) -> tuple[dict, dict, dict]: if pos: qrels_int[qid] = pos - self.relevant_docs = {"test": qrels_int} + self.relevant_docs = {"train": qrels_int} self.data_loaded = True return self.corpus, self.queries, 
self.relevant_docs From 54ddbb9241b9c93bccc7e977bc8b6c3f142aa396 Mon Sep 17 00:00:00 2001 From: iliass Date: Mon, 8 Sep 2025 17:06:30 +0200 Subject: [PATCH 29/29] refactor data loading to read from metadata class attributes --- .../Retrieval/eng/DAPFAMPatentRetrieval.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py index a099f84dd5..afee1b2b58 100644 --- a/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py +++ b/mteb/tasks/Retrieval/eng/DAPFAMPatentRetrieval.py @@ -63,28 +63,22 @@ class _DAPFAMMixin: def load_data(self, **kwargs) -> tuple[dict, dict, dict]: ds_c = load_dataset( - kwargs.get("dataset", {}).get("path", HF_REPO), + self.metadata.dataset["path"], "corpus", - split=kwargs.get("eval_splits", "train"), - revision=kwargs.get("dataset", {}).get( - "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" - ), + split=self.metadata.eval_splits[0], + revision=self.metadata.dataset["revision"], ) ds_q = load_dataset( - kwargs.get("dataset", {}).get("path", HF_REPO), + self.metadata.dataset["path"], "queries", - split=kwargs.get("eval_splits", "train"), - revision=kwargs.get("dataset", {}).get( - "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" - ), + split=self.metadata.eval_splits[0], + revision=self.metadata.dataset["revision"], ) ds_r = load_dataset( - kwargs.get("dataset", {}).get("path", HF_REPO), + self.metadata.dataset["path"], "relations", - split=kwargs.get("eval_splits", "train"), - revision=kwargs.get("dataset", {}).get( - "revision", "780f4011d60297fc6e97a4119b0c516d13afea2d" - ), + split=self.metadata.eval_splits[0], + revision=self.metadata.dataset["revision"], ) self.corpus = {