diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 36641507ac..50e2b45cc5 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -71,7 +71,7 @@ def load_results( MTEB_EN = Benchmark( - name="MTEB(eng, beta)", + name="MTEB(eng)", tasks=MTEBTasks( get_tasks( tasks=[ @@ -128,7 +128,13 @@ def load_results( get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]), ), ), - description="English benchmarks from MTEB", + description="""The new English Massive Text Embedding Benchmark. +This benchmark was created to account for the fact that many models have now been finetuned +to tasks in the original MTEB, and contains tasks that are not as frequently used for model training. +This way the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance. + +The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab. + """, citation="", contacts=["KennethEnevoldsen", "Muennighoff"], ) @@ -216,7 +222,12 @@ def load_results( get_task("STS22", eval_splits=["test"], hf_subsets=["en"]), ) ), - description="The original English benchmark by Muennighoff et al., (2023).", + description="""The original English benchmark by Muennighoff et al., (2023). +This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard). + +> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead, +as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance. + """, citation="""@inproceedings{muennighoff-etal-2023-mteb, title = "{MTEB}: Massive Text Embedding Benchmark", author = "Muennighoff, Niklas and @@ -275,7 +286,7 @@ def load_results( "STS22", ], ), - description="Main Russian benchmarks from MTEB", + description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.", reference="https://aclanthology.org/2023.eacl-main.148/", citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, @@ -324,8 +335,8 @@ def load_results( "LegalQuAD", ] ), - description="Legal benchmarks from MTEB.", - reference="https://aclanthology.org/2023.eacl-main.148/", + description="A benchmark of retrieval tasks in the legal domain.", + reference=None, citation=None, ) @@ -365,7 +376,10 @@ def load_results( "Tatoeba", ] ), - description="BitextMining benchmark from MINERS", + description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the + ability of multilingual LMs in semantic retrieval tasks, + including bitext mining and classification via retrieval-augmented contexts. 
+ """, reference="https://arxiv.org/pdf/2406.07424", citation=""" @article{winata2024miners, @@ -533,7 +547,7 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),) ), - description="Main French benchmarks from MTEB", + description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.", reference="https://arxiv.org/abs/2405.20468", citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence, title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis}, @@ -581,7 +595,7 @@ def load_results( "STS22", ], ), - description="Main German benchmarks from MTEB", + description="A benchmark for text-embedding performance in German.", reference="https://arxiv.org/html/2401.02709v1", citation="""@misc{wehrli2024germantextembeddingclustering, title={German Text Embedding Clustering Benchmark}, @@ -613,7 +627,7 @@ def load_results( "KorSTS", ], ), - description="Main Korean benchmarks from MTEB", + description="A benchmark and leaderboard for evaluation of text embedding in Korean.", reference=None, citation=None, ) @@ -650,7 +664,11 @@ def load_results( ) + (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),), ), - description="Main Polish benchmarks from MTEB", + description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP +tasks from 5 task types. With tasks adapted based on previously used datasets by the Polish +NLP community. In addition, a new PLSC (Polish Library of Science Corpus) dataset was created +consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for +two novel clustering tasks.""", # Rephrased from the abstract reference="https://arxiv.org/abs/2405.10138", citation="""@article{poswiata2024plmteb, title={PL-MTEB: Polish Massive Text Embedding Benchmark}, @@ -695,14 +713,14 @@ def load_results( "typescript", ], ), - description="Main code benchmarks from MTEB", + description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.", reference=None, citation=None, ) MTEB_multilingual = Benchmark( - name="MTEB(Multilingual, beta)", + name="MTEB(Multilingual)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -840,7 +858,7 @@ def load_results( "MIRACLRetrievalHardNegatives", ], ), - description="The Multilingual benchmarks from MMTEB. 
Currently under development.", + description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], @@ -875,7 +893,7 @@ def load_results( "ESCIReranking", ], ), - description="Main Japanese benchmarks from MTEB", + description="JMTEB is a benchmark for evaluating Japanese text embedding models.", reference="https://github.com/sbintuitions/JMTEB", citation=None, ) @@ -915,7 +933,7 @@ def load_results( ] MTEB_INDIC = Benchmark( - name="MTEB(Indic, beta)", + name="MTEB(Indic)", tasks=get_tasks( tasks=[ # Bitext @@ -952,7 +970,7 @@ def load_results( languages=indic_languages, exclusive_language_filter=True, ), - description="Main Indic benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targeting embedding performance on Indic languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], @@ -1003,7 +1021,7 @@ def load_results( ] MTEB_EU = Benchmark( - name="MTEB(Europe, beta)", + name="MTEB(Europe)", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -1084,7 +1102,7 @@ def load_results( languages=eu_languages, exclusive_language_filter=True, ), - description="Main European benchmark from MMTEB", + description="A regional geopolitical text embedding benchmark targeting embedding performance on European languages.", reference=None, citation=None, contacts=["KennethEnevoldsen", "isaac-chung"], @@ -1102,7 +1120,10 @@ def load_results( "LEMBWikimQARetrieval", ], ), - description="The main benchmark for evaluating long document retrieval.", + description="""LongEmbed is a benchmark aimed at exploring models' performance on long-context retrieval. + The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks, + featuring documents of varying length and dispersed target information. + """, # Pieced together from paper abstract. reference="https://arxiv.org/abs/2404.12096v2", citation="""@article{zhu2024longembed, title={LongEmbed: Extending Embedding Models for Long Context Retrieval}, @@ -1117,7 +1138,13 @@ def load_results( tasks=get_tasks( tasks=["BrightRetrieval"], ), - description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.", + description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval. + BRIGHT is the first text retrieval + benchmark that requires intensive reasoning to retrieve relevant documents, with + a dataset consisting of 1,384 real-world queries spanning diverse domains, such as + economics, psychology, mathematics, and coding. These queries are drawn from + naturally occurring and carefully curated human data.
+ """, reference="https://brightbenchmark.github.io/", citation="""@article{su2024bright, title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval}, diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index cb806e4671..5ee5a6b9da 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -6,6 +6,7 @@ import tempfile import time from pathlib import Path +from typing import Literal from urllib.parse import urlencode import gradio as gr @@ -48,9 +49,12 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: return md +DEFAULT_BENCHMARK_NAME = "MTEB(Multilingual)" + + def set_benchmark_on_load(request: gr.Request): query_params = request.query_params - return query_params.get("benchmark_name", "MTEB(Multilingual, beta)") + return query_params.get("benchmark_name", DEFAULT_BENCHMARK_NAME) def download_table(table: pd.DataFrame) -> Path: @@ -117,23 +121,75 @@ def update_task_info(task_names: str) -> gr.DataFrame: return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) +# Model sizes in million parameters +MIN_MODEL_SIZE, MAX_MODEL_SIZE = 0, 10_000 + + +def filter_models( + model_names, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot_setting, +): + lower, upper = model_size + # Setting to None, when the user doesn't specify anything + if (lower == MIN_MODEL_SIZE) and (upper == MAX_MODEL_SIZE): + lower, upper = None, None + else: + # Multiplying by millions + lower = lower * 1e6 + upper = upper * 1e6 + model_metas = mteb.get_model_metas( + model_names=model_names, + open_weights=availability, + use_instructions=instructions, + frameworks=compatibility, + n_parameters_range=(lower, upper), + ) + tasks = mteb.get_tasks(tasks=task_select) + models_to_keep = set() + for model_meta in model_metas: + is_model_zero_shot = model_meta.is_zero_shot_on(tasks) + if is_model_zero_shot is None: + if zero_shot_setting == "hard": + continue + elif not is_model_zero_shot: + if zero_shot_setting != "off": + continue + models_to_keep.add(model_meta.name) + return list(models_to_keep) + + logger.info("Loading all benchmark results") all_results = load_results() -# Model sizes in million parameters -min_model_size, max_model_size = 0, 10_000 - benchmarks = mteb.get_benchmarks() all_benchmark_results = { benchmark.name: benchmark.load_results(base_results=all_results) for benchmark in benchmarks } -default_benchmark = mteb.get_benchmark("MTEB(Multilingual, beta)") +default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) default_results = all_benchmark_results[default_benchmark.name] logger.info("Benchmark results loaded") default_scores = default_results.get_scores(format="long") -summary_table, per_task_table = scores_to_tables(default_scores) +all_models = list({entry["model_name"] for entry in default_scores}) +filtered_models = filter_models( + all_models, + default_results.task_names, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot_setting="soft", +) + +summary_table, per_task_table = scores_to_tables( + [entry for entry in default_scores if entry["model_name"] in filtered_models] +) benchmark_select = gr.Dropdown( [bench.name for bench in benchmarks], @@ -207,7 +263,7 @@ def update_task_info(task_names: str) -> gr.DataFrame: with gr.Row(): searchbar = gr.Textbox( label="Search Models", - info="Search models by name (RegEx sensitive. 
Separate queries with `|`)", + info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)", interactive=True, ) compatibility = gr.CheckboxGroup( @@ -258,14 +314,14 @@ def update_task_info(task_names: str) -> gr.DataFrame: interactive=True, ) model_size = RangeSlider( - minimum=min_model_size, - maximum=max_model_size, - value=(min_model_size, max_model_size), + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), label="Model Size (#M Parameters)", interactive=True, ) scores = gr.State(default_scores) - models = gr.State(list({entry["model_name"] for entry in default_scores})) + models = gr.State(filtered_models) with gr.Row(): with gr.Column(): description = gr.Markdown( @@ -295,6 +351,10 @@ def update_task_info(task_names: str) -> gr.DataFrame: """ ) summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] + ) with gr.Accordion( "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", open=False, @@ -308,10 +368,19 @@ def update_task_info(task_names: str) -> gr.DataFrame: **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. """ ) - download_summary = gr.DownloadButton("Download Table") - download_summary.click( - download_table, inputs=[summary_table], outputs=[download_summary] - ) + with gr.Accordion( + "What does zero-shot mean?", + open=False, + ): + gr.Markdown( + """ +A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. +E.g., if a model is trained on Natural Questions, it cannot be considered zero-shot on benchmarks containing the task “NQ”, which is derived from Natural Questions. +This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P” as these datasets are not based on title-body pairs. +Distilled models, further fine-tunes, and other derivative models inherit the datasets of their parent models. +Based on community feedback and research findings, this definition could change in the future.
+ """ + ) with gr.Tab("Performance per task"): per_task_table.render() download_per_task = gr.DownloadButton("Download Table") @@ -405,51 +474,14 @@ def update_task_list(benchmark_name, type_select, domain_select, lang_select): outputs=[task_select], ) - def filter_models( - model_names, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot_setting, - ): - lower, upper = model_size - # Setting to None, when the user doesn't specify anything - if (lower == min_model_size) and (upper == max_model_size): - lower, upper = None, None - else: - # Multiplying by millions - lower = lower * 1e6 - upper = upper * 1e6 - model_metas = mteb.get_model_metas( - model_names=model_names, - open_weights=availability, - use_instructions=instructions, - frameworks=compatibility, - n_parameters_range=(lower, upper), - ) - tasks = mteb.get_tasks(tasks=task_select) - models_to_keep = set() - for model_meta in model_metas: - is_model_zero_shot = model_meta.is_zero_shot_on(tasks) - if is_model_zero_shot is None: - if zero_shot_setting == "hard": - continue - elif not is_model_zero_shot: - if zero_shot_setting != "off": - continue - models_to_keep.add(model_meta.name) - return list(models_to_keep) - def update_models( - scores, - tasks, - availability, - compatibility, - instructions, - model_size, - zero_shot, + scores: list[dict], + tasks: list[str], + availability: bool | None, + compatibility: list[str], + instructions: bool | None, + model_size: tuple[int, int], + zero_shot: Literal["hard", "soft", "off"], ): start_time = time.time() model_names = list({entry["model_name"] for entry in scores}) @@ -544,7 +576,7 @@ def update_models( ], outputs=[models], ) - zero_shot.input( + zero_shot.change( update_models, inputs=[ scores, @@ -594,7 +626,7 @@ def update_tables( inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], ) - searchbar.input( + searchbar.submit( update_tables, inputs=[scores, searchbar, task_select, models], outputs=[summary_table, per_task_table], diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 041df47094..ef28392cf7 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -142,6 +142,11 @@ def scores_to_tables( names = per_task.index.get_level_values("model_name") names = pd.Series(names, index=per_task.index) to_remove |= ~names.str.contains(search_query, regex=True) + if to_remove.all(): + no_results_frame = pd.DataFrame( + {"No results": ["You can try relaxing your criteria"]} + ) + return gr.DataFrame(no_results_frame), gr.DataFrame(no_results_frame) models_to_remove = list(per_task[to_remove].index) typed_mean = mean_per_type.mean(skipna=False, axis=1) overall_mean = per_task.mean(skipna=False, axis=1) @@ -218,7 +223,11 @@ def scores_to_tables( joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = ( joint_table.style.format( - {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"} + { + **{column: "{:.2f}" for column in score_columns}, + "Rank (Borda)": "{:.0f}", + }, + na_rep="", ) .highlight_min("Rank (Borda)", props="font-weight: bold") .highlight_max(subset=score_columns, props="font-weight: bold") @@ -226,7 +235,7 @@ def scores_to_tables( task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 per_task_style = per_task.style.format( - "{:.2f}", subset=task_score_columns + "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, 
props="font-weight: bold") return ( gr.DataFrame( diff --git a/mteb/model_meta.py b/mteb/model_meta.py index eed74c5b49..b0dbccf24e 100644 --- a/mteb/model_meta.py +++ b/mteb/model_meta.py @@ -68,7 +68,8 @@ class ModelMeta(BaseModel): release_date: The date the model's revision was released. license: The license under which the model is released. Required if open_weights is True. open_weights: Whether the model is open source or proprietary. - public_training_code: Whether the code used to train the model is publicly available. + public_training_code: A link to the publicly available training code. If none it is assumed that the training code is not publicly available. + public_training_data: A link to the publicly available training data. If none it is assumed that the training data is not publicly available. similarity_fn_name: The distance metric used by the model. framework: The framework the model is implemented in, can be a list of frameworks e.g. `["Sentence Transformers", "PyTorch"]`. reference: A URL to the model's page on huggingface or another source. @@ -97,6 +98,7 @@ class ModelMeta(BaseModel): license: str | None open_weights: bool | None public_training_code: str | None + public_training_data: str | bool | None framework: list[FRAMEWORKS] reference: STR_URL | None = None similarity_fn_name: DISTANCE_METRICS | None diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index b7217d1ef9..dd3cd1c8df 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -116,6 +116,7 @@ url={https://arxiv.org/abs/2407.18887}, }""", public_training_code=None, + public_training_data=None, training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -164,7 +165,8 @@ use_instructions=True, adapted_from="sentence-transformers/all-MiniLM-L6-v2", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -213,7 +215,8 @@ use_instructions=True, adapted_from="intfloat/e5-small-unsupervised", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -262,7 +265,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v1.5", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -311,7 +315,8 @@ use_instructions=True, adapted_from="nomic-ai/nomic-embed-text-v1-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -360,7 +365,8 @@ use_instructions=True, adapted_from="intfloat/e5-base-unsupervised", superseded_by="Snowflake/snowflake-arctic-embed-l-v2.0", - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming 
everything @@ -412,6 +418,7 @@ adapted_from=None, superseded_by="Snowflake/snowflake-arctic-embed-m-v2.0", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -437,7 +444,8 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-multilingual-base", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything @@ -485,7 +493,8 @@ use_instructions=True, adapted_from="BAAI/bge-m3-retromae", superseded_by=None, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets={ # source: https://arxiv.org/pdf/2405.05374 # splits not specified to assuming everything diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py index 79d220588a..91ff256bb8 100644 --- a/mteb/models/bge_models.py +++ b/mteb/models/bge_models.py @@ -4,6 +4,8 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + model_prompts = {"query": "Represent this sentence for searching relevant passages: "} BGE_15_CITATION = """@misc{bge_embedding, title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, @@ -15,8 +17,8 @@ }""" model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"} -bge_m_training_data = { - # source: https://arxiv.org/pdf/2402.03216 +bge_m3_training_data = { + # source: https://arxiv.org/abs/2402.03216 "MIRACLRetrieval": ["train"], "MIRACLRetrievalHardNegatives": ["train"], "MIRACLReranking": ["train"], @@ -36,6 +38,28 @@ "HotpotQA": ["train"], "HotpotQA-PL": ["train"], # translation not trained on "HotpotQAHardNegatives": ["train"], + "T2Retrieval": ["train"], + "DuReader": ["train"], + "MMarcoReranking": ["train"], + "CodeSearchNet": ["train"], + # not in mteb + # "s2orc" + # Wikipedia + # "xP3" + # "mC4" + # "CC-News" + # "MTP" + # "NLLB" + # "CCMatrix" + # TriviaQA + # COL-IEE + # PubMedQA + # SQuAD + # SimCSE + # mMARCO-ZH + # LawGPT + # NLI-zh2, LeCaRDv2, + # NLI, MultiLongDoc (their syntetic) # + synthetic data } @@ -97,38 +121,6 @@ # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) } -bgem3_training_data = { - # source https://arxiv.org/abs/2402.03216 - "T2Retrieval": ["train"], - "DuReader": ["train"], - "MMarcoReranking": ["train"], - "CMedQAv2-reranking": ["train"], - "HotpotQA": ["train"], - "NQ": ["train"], - "MSMARCO": ["train"], - "MrTidyRetrieval": ["train"], - "MIRACLRetrieval": ["train"], - "CodeSearchNet": ["train"], - # not in mteb - # "s2orc" - # Wikipedia - # "xP3" - # "mC4" - # "CC-News" - # "MTP" - # "NLLB" - # "CCMatrix" - # TriviaQA - # COL-IEE - # PubMedQA - # SQuAD - # SimCSE - # mMARCO-ZH - # LawGPT - # NLI-zh2, LeCaRDv2, - # NLI, MultiLongDoc (their syntetic) -} - # https://huggingface.co/BAAI/bge-m3/discussions/29 bgem3_languages = [ "afr_Latn", # af @@ -306,59 +298,6 @@ "zho_Hans", # zh ] -bge_m_training_data = { - # source: https://arxiv.org/pdf/2402.03216 - "MIRACLRetrieval": ["train"], - "MIRACLRetrievalHardNegatives": ["train"], - "MIRACLReranking": ["train"], - "LeCaRDv2": ["train"], - "CMedQAv1-reranking": ["train"], - "CMedQAv2-reranking": ["train"], - "MrTidyRetrieval": ["train"], - "T2Reranking": ["train"], - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - 
"NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - "HotpotQA": ["train"], - "HotpotQA-PL": ["train"], # translation not trained on - "HotpotQAHardNegatives": ["train"], - # + synthetic data -} - -bge_training_data = { - # source: https://data.baai.ac.cn/details/BAAI-MTP - "NQ": ["test"], - "NQHardNegatives": ["test"], - "AmazonReviewsClassification": [ - "validation", - "test", - ], # assumed from: amazon_reviews_multi - "MLQARetrieval": [ - "validation", - "test", - ], # assumed from mlqa (question, context) - # not in mteb - # Dataset Pairs - # wudao (title, passage) - # cmrc2018 (query, context) - # dureader (query, context) - # simclue (sentence_a, sentence_b) - # csl (title, abstract) - # amazon_reviews_multi (title, body) - # wiki_atomic_edits (base_sentence, edited_sentence) - # mlqa (question, context) - # xlsum (title, summary) (title, text) - # "sentence-transformers data": [], # https://huggingface.co/datasets/sentence-transformers/embedding-training-data # TODO check this further - # "wikipedia": [], # title + section title, passage - # "reddit": [], # title, body - # "stackexchange": [], # (title, upvoted answer) (title+body, upvoted answer) - # "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract) -} bge_small_en_v1_5 = ModelMeta( loader=partial( # type: ignore @@ -380,7 +319,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, citation=BGE_15_CITATION, ) @@ -406,6 +346,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, citation=BGE_15_CITATION, ) @@ -432,6 +373,7 @@ use_instructions=True, citation=BGE_15_CITATION, public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_data="https://data.baai.ac.cn/details/BAAI-MTP", training_datasets=bge_training_data, ) @@ -455,7 +397,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -479,7 +422,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -503,7 +447,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - public_training_code=None, # seemingly released (at least for some models, but the link is broken + public_training_code=None, + public_training_data=None, training_datasets=bge_chinese_training_data, ) @@ -527,7 +472,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=bgem3_training_data, + 
public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_m3_training_data, ) @@ -560,5 +506,86 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # not disclosed ) + +# Contents of cfli/bge-full-data +bge_full_data = { + # source: https://arxiv.org/pdf/2409.15700 + # Charles Goodhart is turning back and forth + # in his grave as I'm annotating this + # |Retrieval| + # ELI5 + # SQuaD + # TriviaQA + # QuoraDuplicateQuestions + "HotpotQA": ["train"], + "FEVER": ["train"], + "MSMARCO": ["train"], + "NQ": ["train"], + "ArguAna": ["train"], + "FiQA2018": ["train"], + # |Reranking| + "SciDocsReranking": ["train"], + "StackOverflowDupQuestions": ["train"], + # |Classification| + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ImdbClassification": ["train"], + "ToxicConversationsClassification": ["train"], + # |Clustering| + "ArxivClusteringS2S": ["train"], + "ArxivClusteringP2P": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringP2P": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringP2P": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "RedditClusteringP2P": ["train"], + "RedditClustering": ["train"], + "RedditClustering.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + # |STS| + "STS22": ["train"], + "STS22.v2": ["train"], + "STSBenchmark": ["train"], +} + +bge_en_icl = ModelMeta( + loader=partial( + sentence_transformers_loader, + model_name="BAAI/bge-en-icl", + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + ), + name="BAAI/bge-en-icl", + languages=[ + "eng_Latn", + ], + open_weights=True, + revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5", + release_date="2024-07-25", # initial commit of hf model. 
+ n_parameters=7.11 * 1e9, + embed_dim=4096, + license="apache-2", + max_tokens=32768, + reference="https://huggingface.co/BAAI/bge-en-icl", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code="https://github.com/FlagOpen/FlagEmbedding", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets={ + **E5_MISTRAL_TRAINING_DATA, + **bge_full_data, + }, + adapted_from="intfloat/e5-mistral-7b-instruct", +) diff --git a/mteb/models/bm25.py b/mteb/models/bm25.py index ea56fd432b..6e3d3747d9 100644 --- a/mteb/models/bm25.py +++ b/mteb/models/bm25.py @@ -139,5 +139,6 @@ def encode(self, texts: list[str], **kwargs): framework=[], use_instructions=False, public_training_code="https://github.com/xhluca/bm25s", + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/cde_models.py b/mteb/models/cde_models.py new file mode 100644 index 0000000000..78870ef129 --- /dev/null +++ b/mteb/models/cde_models.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import logging + +from mteb.model_meta import ModelMeta + +from .bge_models import bge_full_data + +logger = logging.getLogger(__name__) + + +cde_small_v1 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v1", + languages=["eng_Latn"], + open_weights=True, + revision="8d5736163718a8b65cd787b75ed61020d18bad3c", + release_date="2024-09-24", + n_parameters=int(281 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="nomic-ai/nomic-bert-2048", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) + +cde_small_v2 = ModelMeta( + loader=None, # I will leave this at None for now, + name="jxm/cde-small-v2", + languages=["eng_Latn"], + open_weights=True, + revision="a7e5882ad52c27ea2831fc8258f24379c25cb459", + release_date="2025-01-13", + n_parameters=int(306 * 1e6), # Though the second-stage model is only 140M + max_tokens=512, + embed_dim=768, + license="mit", + similarity_fn_name="cosine", + framework=["Sentence Transformers"], + reference="https://huggingface.co/jxm/cde-small-v1", + use_instructions=True, + adapted_from="answerdotai/ModernBERT-base", + superseded_by="jxm/cde-small-v2", + training_datasets=bge_full_data, + public_training_code="https://github.com/jxmorris12/cde", + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", +) diff --git a/mteb/models/cohere_models.py b/mteb/models/cohere_models.py index 8718a2e2a3..60ff63ee81 100644 --- a/mteb/models/cohere_models.py +++ b/mteb/models/cohere_models.py @@ -234,7 +234,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -257,7 +258,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -280,7 +282,8 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - 
public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -303,6 +306,7 @@ def encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py index 0a8c0e4a57..6c29510855 100644 --- a/mteb/models/colbert_models.py +++ b/mteb/models/colbert_models.py @@ -156,6 +156,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="c1e84128e85ef755c096a95bdb06b47793b13acf", public_training_code=None, + public_training_data=None, release_date="2024-09-21", n_parameters=110 * 1e6, max_tokens=180, # Reduced for Benchmarking - see ColBERT paper @@ -167,7 +168,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, - training_datasets=None, + training_datasets={ + "MSMARCO": ["train"], # dev? + }, ) @@ -208,6 +211,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: open_weights=True, revision="4cf816e5e2b03167b132a3c847a9ecd48ba708e1", public_training_code=None, + public_training_data=None, release_date="2024-08-16", n_parameters=559 * 1e6, max_tokens=8192, @@ -219,5 +223,9 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray: use_instructions=False, adapted_from=None, superseded_by=None, - training_datasets=None, + training_datasets={ + "MSMARCO": ["train"], + "DuRetrieval": [], + "MIRACL": ["train"], + }, ) diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 1d457652a7..58afc17976 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -15,6 +15,16 @@ E5_INSTRUCTION = "Instruct: {instruction}\nQuery: " +E5_MISTRAL_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_instruct = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -46,6 +56,7 @@ year={2024} }""", public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -91,5 +102,81 @@ } """, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) + +zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( + loader=partial( # type: ignore + instruct_wrapper, + model_name_or_path="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + instruction_template=E5_INSTRUCTION, + attn="cccc", + pooling_method="lasttoken", + mode="embedding", + torch_dtype=torch.bfloat16, + # The ST script does not normalize while the HF one does so unclear what to do + # https://huggingface.co/intfloat/e5-mistral-7b-instruct#transformers + normalized=True, + ), + name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + revision="c791d37474fa6a5c72eb3a2522be346bc21fbfc3", + release_date="2024-08-30", + languages=["eng_Latn"], + n_parameters=7110660096, + max_tokens=32768.0, + embed_dim=4096, + license="mit", + open_weights=True, + public_training_data=None, + public_training_code=None, + framework=["PyTorch"], + reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", + similarity_fn_name="cosine", + use_instructions=True, + training_datasets={ + # copied from e5 + # source: 
https://arxiv.org/pdf/2212.03533 + "NQ": ["test"], + "NQHardNegatives": ["test"], + "MSMARCO": ["train"], # dev? + # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be + # "Arguana", + # "FEVER", + # "FIQA", + # "HotPotQA", + # "MsMarco (passage)", + # "NFCorpus", + # "SciFact", + # "NLI", + # "SQuad", + # "StackExchange", + # "TriviaQA", + # "SciRep", + # "SciRepEval" + # mteb + # https://huggingface.co/datasets/mteb/raw_arxiv + # "ArxivClusteringS2S": ["train"], + # "ArxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_biorxiv + # "BiorxivClusteringS2S": ["train"], + # "BiorxivClusteringP2P": ["train"], + # https://huggingface.co/datasets/mteb/raw_medrxiv + # "MedrxivClusteringS2S": ["train"], + # "MedrxivClusteringP2P": ["train"], + # as their train datasets + "AmazonCounterfactualClassification": ["train"], + "AmazonReviewsClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ImdbClassification": ["train"], + "STS12": ["train"], + "STS22": ["train"], + "STSBenchmark": ["train"], + }, + adapted_from="intfloat/e5-mistral-7b-instruct", + superseded_by=None, +) diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index fe265f6f41..4c3c3d4790 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -144,6 +144,16 @@ "NQ-PL": ["train"], # translation not trained on } +ME5_TRAINING_DATA = { + **E5_TRAINING_DATA, + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + e5_mult_small = ModelMeta( loader=partial( # type: ignore sentence_transformers_loader, @@ -166,7 +176,8 @@ use_instructions=True, citation=MULTILINGUAL_E5_CITATION, public_training_code=None, # couldn't find - training_datasets=E5_TRAINING_DATA, + training_datasets=ME5_TRAINING_DATA, + public_training_data=None, ) e5_mult_base = ModelMeta( @@ -189,7 +200,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, citation=MULTILINGUAL_E5_CITATION, ) @@ -214,7 +226,8 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets=E5_TRAINING_DATA, + public_training_data=None, + training_datasets=ME5_TRAINING_DATA, citation=MULTILINGUAL_E5_CITATION, ) @@ -238,6 +251,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -263,6 +277,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -291,6 +306,7 @@ adapted_from=None, citation=E5_CITATION, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, ) @@ -317,6 +333,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -344,6 +361,7 @@ 
superseded_by="intfloat/e5-large-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) @@ -371,6 +389,7 @@ superseded_by="intfloat/e5-base-v2", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=E5_TRAINING_DATA, citation=E5_CITATION, ) diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py new file mode 100644 index 0000000000..804dfbc84d --- /dev/null +++ b/mteb/models/gme_models.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +import logging + +from mteb.model_meta import ModelMeta + +logger = logging.getLogger(__name__) + + +gme_qwen2_vl_2b_instruct = ModelMeta( + loader=None, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + languages=["eng_Latn"], + open_weights=True, + revision="cfeb66885b598de483cc04eb08c7d9da534d7afe", + release_date="2024-12-21", + n_parameters=int(2.21 * 1e9), + max_tokens=32768, + embed_dim=1536, + license="mit", + similarity_fn_name="cosine", + framework=["PyTorch"], + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + use_instructions=True, + adapted_from=None, + superseded_by=None, + training_datasets={ + # Only annotating text data for now + # source: https://arxiv.org/pdf/2412.16855 + "MSMARCO": ["train"], + "MSMARCO.v2": ["train"], + }, + public_training_code=None, + public_training_data=None, +) + +gme_qwen2_vl_7b_instruct = ModelMeta( + loader=None, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", + languages=["eng_Latn"], + open_weights=True, + revision="d42eca5a540526cfa982a349724b24b25c12a95e", + release_date="2024-12-21", + n_parameters=int(8.29 * 1e9), + max_tokens=32768, + embed_dim=3584, + license="mit", + similarity_fn_name="cosine", + framework=["PyTorch"], + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", + use_instructions=True, + adapted_from=None, + superseded_by=None, + training_datasets={ + # Only annotating text data for now + # source: https://arxiv.org/pdf/2412.16855 + "MSMARCO": ["train"], + "MSMARCO.v2": ["train"], + }, + public_training_code=None, + public_training_data=None, +) diff --git a/mteb/models/google_models.py b/mteb/models/google_models.py index 08065f7af0..40d316fee7 100644 --- a/mteb/models/google_models.py +++ b/mteb/models/google_models.py @@ -151,7 +151,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -173,7 +174,8 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) @@ -195,6 +197,7 @@ def encode( similarity_fn_name="cosine", # assumed framework=["API"], use_instructions=True, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index ab32a6a9a6..eb23ee66bf 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ -11,6 +11,18 @@ logger = logging.getLogger(__name__) +GRIT_LM_TRAINING_DATA = { + **E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + # also uses medi2 which contains fever and hotpotqa: + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # 
translation not trained on + "HotpotQA": ["train"], + "HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + + def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: return ( "<|user|>\n" + instruction + "\n<|embed|>\n" if instruction else "<|embed|>\n" @@ -50,9 +62,10 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 + training_datasets=GRIT_LM_TRAINING_DATA, # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, citation=GRITLM_CITATION, ) gritlm8x7b = ModelMeta( @@ -76,8 +89,9 @@ def gritlm_instruction(instruction: str = "", prompt_type=None) -> str: similarity_fn_name="cosine", framework=["GritLM", "PyTorch"], use_instructions=True, + training_datasets=GRIT_LM_TRAINING_DATA, citation=GRITLM_CITATION, - training_datasets=E5_TRAINING_DATA, # source https://arxiv.org/pdf/2402.09906 # section 3.1 "We finetune our final models from Mistral 7B [68] and Mixtral 8x7B [69] using adaptations of E5 [160] and the Tülu 2 data public_training_code="https://github.com/ContextualAI/gritlm", + public_training_data=None, ) diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py index f80dc01fdd..fb3bb6db3e 100644 --- a/mteb/models/gte_models.py +++ b/mteb/models/gte_models.py @@ -56,6 +56,7 @@ def instruction_template( use_instructions=True, citation=GTE_CITATION, public_training_code=None, + public_training_data=None, training_datasets=None, max_tokens=131072, ) @@ -87,6 +88,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -117,6 +119,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -140,6 +143,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -163,6 +167,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -186,6 +191,7 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # Not disclosed ) @@ -301,6 +307,7 @@ def instruction_template( similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, # couldn't find + public_training_code=None, + public_training_data=None, # couldn't find training_datasets=gte_multi_training_data, ) diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py index 78bad6097f..e7c3b8b022 100644 --- a/mteb/models/ibm_granite_models.py +++ b/mteb/models/ibm_granite_models.py @@ -20,6 +20,65 @@ "zho_Hans", ] +granite_training_data = { + # Multilingual MC4 + # Multilingual Webhose + # English Wikipedia + # Multilingual Wikimedia + "WikipediaRetrievalMultilingual": 
[], + "WikipediaRerankingMultilingual": [], + # Miracl Corpus (Title-Body) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (titles) + # Stack Exchange Duplicate questions (bodies) + "StackOverflowDupQuestions": [], + "AskUbuntuDupQuestions": [], + # Stack Exchange (Title, Answer) pairs + # Stack Exchange (Title, Body) pairs + # Stack Exchange (Title, Body) pairs + # Machine Translations of Stack Exchange Duplicate questions (titles) + # Machine Translations of Stack Exchange (Title+Body, Answer) pairs + "StackExchangeClusteringP2P": [], + "StackExchangeClusteringP2P.v2": [], + "StackExchangeClustering": [], + "StackExchangeClustering.v2": [], + # SearchQA + # S2ORC (Title, Abstract) + # WikiAnswers Duplicate question pairs + # CCNews + # XSum + # SimpleWiki + # Machine Translated Cross Lingual Parallel Corpora + # SPECTER citation triplets + # Machine Translations of SPECTER citation triplets + # Natural Questions (NQ) + "NQ": ["test"], + "NQHardNegatives": ["test"], + # SQuAD2.0 + # HotpotQA + "HotPotQA": ["test"], + "HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # Fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], + # PubMed + # Multilingual Miracl Triples + "MIRACLRetrieval": ["train"], + "MIRACLRetrievalHardNegatives": ["train"], + "MIRACLReranking": ["train"], + # Multilingual MrTydi Triples + "MrTidyRetrieval": ["train"], + # Sadeeem Question Asnwering + # DBPedia Title-Body Pairs + "DBPedia": ["train"], + # Synthetic: English Query-Wikipedia Passage + # Synthetic: English Fact Verification + # Synthetic: Multilingual Query-Wikipedia Passage + # Synthetic: Multilingual News Summaries + # IBM Internal Triples + # IBM Internal Title-Body Pairs +} granite_107m_multilingual = ModelMeta( loader=partial( # type: ignore @@ -42,8 +101,9 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_278m_multilingual = ModelMeta( @@ -67,8 +127,9 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_30m_english = ModelMeta( @@ -92,8 +153,9 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) granite_125m_english = ModelMeta( @@ -117,6 +179,7 @@ adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, use_instructions=False, - training_datasets=None, + training_datasets=granite_training_data, ) diff --git a/mteb/models/inf_models.py b/mteb/models/inf_models.py index dc31adccd2..0d40ff3ef2 100644 --- a/mteb/models/inf_models.py +++ b/mteb/models/inf_models.py @@ -26,5 +26,6 @@ use_instructions=True, adapted_from="Alibaba-NLP/gte-Qwen2-7B-instruct", public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index 1dc06d5640..dbd1615ad8 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -93,4 +93,5 @@ def encode( training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 # "non_mteb": 
["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index f9b1f1b72a..41742a2ee3 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -222,8 +222,25 @@ def encode( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, reference="https://huggingface.co/jinaai/jina-embeddings-v3", - training_datasets=None, public_training_code=None, + public_training_data=None, + training_datasets={ + # CulturaX + "STS12": [], + # "SICK": [], + # "WMT19": [], + # "MADLAD-3B": [], + # NLI + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # oasst1, oasst2 + }, + adapted_from="XLM-RoBERTa", citation=""" @misc{sturua2024jinaembeddingsv3multilingualembeddingstask, title={jina-embeddings-v3: Multilingual Embeddings With Task LoRA}, @@ -256,6 +273,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embeddings_v2_small_en = ModelMeta( @@ -276,6 +294,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embedding_b_en_v1 = ModelMeta( @@ -296,6 +315,7 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) jina_embedding_s_en_v1 = ModelMeta( @@ -316,4 +336,5 @@ def encode( adapted_from=None, training_datasets=None, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py new file mode 100644 index 0000000000..380724e53e --- /dev/null +++ b/mteb/models/lens_models.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.model_meta import ModelMeta + +from .bge_models import bge_full_data + +lens_d4000 = ModelMeta( + loader=None, # TODO: implement this in the future + name="yibinlei/LENS-d4000", + languages=None, + open_weights=True, + revision="e473b33364e6c48a324796fd1411d3b93670c6fe", + release_date="2025-01-17", + n_parameters=int(7.11 * 1e9), + embed_dim=4000, + license="apache-2.0", + reference="https://huggingface.co/yibinlei/LENS-d4000", + similarity_fn_name="cosine", + framework=["PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, + max_tokens=32768, +) + +lens_d8000 = ModelMeta( + loader=None, # TODO: implement this in the future + name="yibinlei/LENS-d8000", + languages=None, + open_weights=True, + revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef", + release_date="2025-01-17", + n_parameters=int(7.11 * 1e9), + embed_dim=8000, + license="apache-2.0", + reference="https://huggingface.co/yibinlei/LENS-d8000", + similarity_fn_name="cosine", + framework=["PyTorch"], + use_instructions=True, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets=bge_full_data, + max_tokens=32768, +) diff --git a/mteb/models/linq_models.py b/mteb/models/linq_models.py index 11cfa74ed1..ead10ebf71 100644 --- a/mteb/models/linq_models.py +++ b/mteb/models/linq_models.py @@ -40,5 +40,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, + 
public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/llm2vec_models.py b/mteb/models/llm2vec_models.py index 7083534751..82186309db 100644 --- a/mteb/models/llm2vec_models.py +++ b/mteb/models/llm2vec_models.py @@ -138,6 +138,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, citation=LLM2VEC_CITATION, ) @@ -165,6 +166,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) @@ -192,6 +194,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_mistral7b_unsupervised = ModelMeta( @@ -218,6 +221,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) llm2vec_llama2_7b_supervised = ModelMeta( @@ -244,6 +248,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_llama2_7b_unsupervised = ModelMeta( @@ -269,6 +274,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, citation=LLM2VEC_CITATION, ) @@ -296,6 +302,7 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets=llm2vec_supervised_training_data, + public_training_data=None, ) llm2vec_sheared_llama_unsupervised = ModelMeta( @@ -322,4 +329,5 @@ def loader_inner(**kwargs: Any) -> Encoder: citation=LLM2VEC_CITATION, public_training_code="https://github.com/McGill-NLP/llm2vec/tree/250292a307428240d801fadd85825464e71c3277/train_configs", training_datasets={}, + public_training_data=None, ) diff --git a/mteb/models/misc_models.py b/mteb/models/misc_models.py index 5233ecec6b..140d8bac74 100644 --- a/mteb/models/misc_models.py +++ b/mteb/models/misc_models.py @@ -7,7 +7,7 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader from mteb.models.e5_models import E5_TRAINING_DATA -from .bge_models import bge_m_training_data, bge_training_data +from .bge_models import bge_m3_training_data, bge_training_data from .sentence_transformers_models import sent_trf_training_dataset Haon_Chen__speed_embedding_7b_instruct = ModelMeta( @@ -22,6 +22,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Haon-Chen/speed-embedding-7b-instruct", similarity_fn_name="cosine", @@ -42,6 +43,7 @@ license=None, open_weights=True, 
public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Gameselo/STS-multilingual-mpnet-base-v2", similarity_fn_name="cosine", @@ -62,6 +64,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1", similarity_fn_name="cosine", @@ -82,6 +85,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/HIT-TMG/KaLM-embedding-multilingual-mini-v1", similarity_fn_name="cosine", @@ -102,6 +106,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/Hum-Works/lodestone-base-4096-v1", similarity_fn_name="cosine", @@ -164,6 +169,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Jaume/gemma-2b-embeddings", similarity_fn_name="cosine", @@ -184,6 +190,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/BeastyZ/e5-R-mistral-7b", similarity_fn_name="cosine", @@ -210,6 +217,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-base", similarity_fn_name="cosine", @@ -235,6 +243,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-large", similarity_fn_name="cosine", @@ -260,6 +269,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Lajavaness/bilingual-embedding-small", similarity_fn_name="cosine", @@ -280,6 +290,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Bulbasaur", similarity_fn_name="cosine", @@ -301,6 +312,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Ivysaur", similarity_fn_name="cosine", @@ -322,6 +334,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Squirtle", similarity_fn_name="cosine", @@ -343,6 +356,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Venusaur", similarity_fn_name="cosine", @@ -364,6 +378,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/Wartortle", similarity_fn_name="cosine", @@ -385,6 +400,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", 
"Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro", similarity_fn_name="cosine", @@ -405,6 +421,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Mihaiii/gte-micro-v4", similarity_fn_name="cosine", @@ -425,6 +442,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrdalieTech/Solon-embeddings-large-0.1", similarity_fn_name="cosine", @@ -445,6 +463,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -465,6 +484,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-MiniLM-L12-v2-all-nli-triplet", similarity_fn_name="cosine", @@ -487,6 +507,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -509,6 +530,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-labse-Matryoshka", similarity_fn_name="cosine", @@ -531,6 +553,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet", similarity_fn_name="cosine", @@ -553,6 +576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka", similarity_fn_name="cosine", @@ -573,6 +597,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-lunaris-text-embeddings", similarity_fn_name="cosine", @@ -593,6 +618,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/consciousAI/cai-stellaris-text-embeddings", similarity_fn_name="cosine", @@ -613,6 +639,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/bge-m3-custom-fr", similarity_fn_name="cosine", @@ -633,6 +660,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.2", similarity_fn_name="cosine", @@ -653,6 +681,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], 
reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.3", similarity_fn_name="cosine", @@ -673,6 +702,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/manu/sentence_croissant_alpha_v0.4", similarity_fn_name="cosine", @@ -694,6 +724,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-base", similarity_fn_name="cosine", @@ -714,6 +745,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-large", similarity_fn_name="cosine", @@ -734,6 +766,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/thenlper/gte-small", similarity_fn_name="cosine", @@ -754,6 +787,7 @@ license="gpl-3.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/KartonBERT-USE-base-v1", similarity_fn_name="cosine", @@ -774,6 +808,7 @@ license="lgpl", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/OrlikB/st-polish-kartonberta-base-alpha-v1", similarity_fn_name="cosine", @@ -794,6 +829,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-base", similarity_fn_name="cosine", @@ -814,6 +850,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/dwzhu/e5-base-4k", similarity_fn_name="cosine", @@ -834,6 +871,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-large", similarity_fn_name="cosine", @@ -854,6 +892,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-e5-small", similarity_fn_name="cosine", @@ -874,6 +913,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-base", similarity_fn_name="cosine", @@ -894,6 +934,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/sdadas/mmlw-roberta-large", similarity_fn_name="cosine", @@ -960,6 +1001,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-1b1", similarity_fn_name="cosine", @@ -1026,6 +1068,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-3b", similarity_fn_name="cosine", @@ -1092,6 +1135,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-560m", 
similarity_fn_name="cosine", @@ -1158,6 +1202,7 @@ license="bigscience-bloom-rail-1.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/izhx/udever-bloom-7b1", similarity_fn_name="cosine", @@ -1178,6 +1223,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-Embedding-v0", similarity_fn_name="cosine", @@ -1198,6 +1244,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-all-MiniLM-L6-v2", similarity_fn_name="cosine", @@ -1218,6 +1265,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-large-Embedding-v0", similarity_fn_name="cosine", @@ -1238,6 +1286,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/GIST-small-Embedding-v0", similarity_fn_name="cosine", @@ -1258,6 +1307,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/bigscience/sgpt-bloom-7b1-msmarco", similarity_fn_name="cosine", @@ -1278,6 +1328,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/aari1995/German_Semantic_STS_V2", similarity_fn_name="cosine", @@ -1299,6 +1350,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/abhinand/MedEmbed-small-v0.1", similarity_fn_name="cosine", @@ -1325,6 +1377,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/avsolatorio/NoInstruct-small-Embedding-v0", similarity_fn_name="cosine", @@ -1345,6 +1398,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/brahmairesearch/slx-v0.1", similarity_fn_name="cosine", @@ -1365,6 +1419,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/deepfile/embedder-100p", similarity_fn_name="cosine", @@ -1385,11 +1440,12 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/deepvk/USER-bge-m3", similarity_fn_name="cosine", use_instructions=None, - training_datasets=bge_m_training_data, # derived from. + training_datasets=bge_m3_training_data, # derived from. 
# not in MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], @@ -1416,6 +1472,7 @@ license="mit", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/infgrad/stella-base-en-v2", similarity_fn_name="cosine", @@ -1436,6 +1493,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch"], reference="https://huggingface.co/malenia1/ternary-weight-embedding", similarity_fn_name="cosine", @@ -1456,6 +1514,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/omarelshehy/arabic-english-sts-matryoshka", similarity_fn_name="cosine", @@ -1486,6 +1545,7 @@ license=None, open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/openbmb/MiniCPM-Embedding", similarity_fn_name="cosine", @@ -1516,6 +1576,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/shibing624/text2vec-base-multilingual", similarity_fn_name="cosine", @@ -1537,6 +1598,7 @@ license="apache-2.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/silma-ai/silma-embeddding-matryoshka-v0.1", similarity_fn_name="cosine", @@ -1545,26 +1607,7 @@ adapted_from="/workspace/v3-matryoshka_aubmindlab-bert-base-arabertv02-2024-10-12_13-55-06/checkpoint-26250", superseded_by=None, ) -zeta_alpha_ai__Zeta_Alpha_E5_Mistral = ModelMeta( - name="zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - revision="3e6076bdc2ff592a2f95fbc04570e51db5aa0c0c", - release_date="2024-08-30", - languages=["eng_Latn"], - loader=None, - n_parameters=7110660096, - max_tokens=32768.0, - embed_dim=4096, - license="mit", - open_weights=True, - public_training_code=None, - framework=["PyTorch"], - reference="https://huggingface.co/zeta-alpha-ai/Zeta-Alpha-E5-Mistral", - similarity_fn_name="cosine", - use_instructions=None, - training_datasets=None, - adapted_from="intfloat/e5-mistral-7b-instruct", - superseded_by=None, -) + sbert_chinese_general_v1 = ModelMeta( name="DMetaSoul/sbert-chinese-general-v1", revision="bd27765956bcc2fcf682de0097819947ac10037e", @@ -1577,6 +1620,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/sbert-chinese-general-v1", similarity_fn_name="cosine", @@ -1601,6 +1645,7 @@ license="apache-2", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/DMetaSoul/Dmeta-embedding-zh-small/", similarity_fn_name="cosine", @@ -1620,6 +1665,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding", similarity_fn_name="cosine", @@ -1640,6 +1686,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/lier007/xiaobu-embedding-v2", 
similarity_fn_name="cosine", @@ -1660,6 +1707,7 @@ license="not specified", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", @@ -1680,6 +1728,7 @@ license="cc-by-nc-4.0", open_weights=True, public_training_code=None, + public_training_data=None, framework=["PyTorch", "Sentence Transformers"], reference="https://huggingface.co/Classical/Yinka", similarity_fn_name="cosine", @@ -1688,3 +1737,41 @@ training_datasets=None, # They "scraped" things from the internet, we don't know, could be leakage superseded_by=None, ) +ember_v1 = ModelMeta( + name="llmrails/ember-v1", + revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d", + release_date="2023-10-10", + languages=["eng_Latn"], + n_parameters=335 * 1e6, + max_tokens=512, + embed_dim=1024, + license="mit", + open_weights=True, + public_training_code=None, + public_training_data=None, + framework=["PyTorch", "Sentence Transformers"], + reference="https://huggingface.co/llmrails/ember-v1", + similarity_fn_name="cosine", + use_instructions=None, + training_datasets=None, + superseded_by=None, +) +amazon_titan_text_embeddings_v2 = ModelMeta( + name="amazon/Titan-text-embeddings-v2", + revision="1", + release_date="2024-04-30", + languages=["eng_Latn"], + n_parameters=None, + max_tokens=None, + embed_dim=None, + license="proprietary", + open_weights=False, + public_training_code=None, + public_training_data=None, + framework=[], + reference="https://huggingface.co/amazon/Titan-text-embeddings-v2", + similarity_fn_name="cosine", + use_instructions=False, + training_datasets=None, + superseded_by=None, +) diff --git a/mteb/models/model2vec_models.py b/mteb/models/model2vec_models.py index afbf9df627..33da211c7a 100644 --- a/mteb/models/model2vec_models.py +++ b/mteb/models/model2vec_models.py @@ -75,7 +75,8 @@ def encode( adapted_from="BAAI/bge-base-en-v1.5", superseded_by=None, training_datasets=bge_training_data, # distilled - public_training_code="https://github.com/MinishLab/model2vec", # + public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) @@ -101,6 +102,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_base_output = ModelMeta( @@ -125,6 +127,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) m2v_multilingual_output = ModelMeta( @@ -149,6 +152,7 @@ def encode( superseded_by=None, training_datasets=None, public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_2m = ModelMeta( @@ -173,6 +177,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_4m = ModelMeta( @@ -197,6 +202,7 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) potion_base_8m = ModelMeta( @@ -221,4 +227,5 @@ def encode( superseded_by=None, training_datasets=bge_training_data, # distilled public_training_code="https://github.com/MinishLab/model2vec", + public_training_data=None, ) diff --git a/mteb/models/moka_models.py 
b/mteb/models/moka_models.py index d3943d78d7..1504b40789 100644 --- a/mteb/models/moka_models.py +++ b/mteb/models/moka_models.py @@ -96,7 +96,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -117,7 +118,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) @@ -139,6 +141,7 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Not published + public_training_code=None, + public_training_data=None, # Not published training_datasets=m3e_dataset, ) diff --git a/mteb/models/mxbai_models.py b/mteb/models/mxbai_models.py index e0be5c9d93..c4bc7c3db8 100644 --- a/mteb/models/mxbai_models.py +++ b/mteb/models/mxbai_models.py @@ -42,5 +42,6 @@ } """, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/no_instruct_sentence_models.py b/mteb/models/no_instruct_sentence_models.py index a0596b9bd1..9ff5cf901f 100644 --- a/mteb/models/no_instruct_sentence_models.py +++ b/mteb/models/no_instruct_sentence_models.py @@ -100,5 +100,6 @@ def encode( # type: ignore adapted_from=None, superseded_by=None, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/nomic_models.py b/mteb/models/nomic_models.py index b2b0542543..c2d06e2f6e 100644 --- a/mteb/models/nomic_models.py +++ b/mteb/models/nomic_models.py @@ -90,6 +90,79 @@ def encode( # type: ignore return emb +nomic_training_data = { + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/contrastive_pretrain.yaml + # reddit_title_body + "RedditClustering": [], + "RedditClusteringP2P": [], + "RedditClustering.v2": [], + "RedditClusteringP2P.v2": [], + # amazon_reviews + # amazonqa + "AmazonPolarityClassification": [], + "AmazonReviewsClassification": [], + "AmazonCounterfactualClassification": [], + # paq + # s2orc_citation_titles + # s2orc_title_abstract + # s2orc_abstract_citation + # s2orc_abstract_body + # wikianswers + # wikipedia + "WikipediaRetrievalMultilingual": [], + "WikipediaRerankingMultilingual": [], + # gooaq + # codesearch + "CodeSearchNetCCRetrieval": [], + "COIRCodeSearchNetRetrieval": [], + # yahoo_title_answer + # yahoo_qa + # yahoo_title_question + "YahooAnswersTopicsClassification": [], + # agnews + # ccnews + # npr + # eli5 + # cnn + # stackexchange_duplicate_questions + # stackexchange_title_body + # stackexchange_body_body + "StackExchangeClustering.v2": [], + "StackExchangeClusteringP2P.v2": [], + # sentence_compression + # wikihow + # altlex + # quora + "QuoraRetrieval": [], + "NanoQuoraRetrieval": [], + # simplewiki + # squad + "FQuADRetrieval": [], + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/data/finetune_triplets.yaml + # msmaro + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + # nq_triples + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + # nli_triplets + # reddit + # medi_wiki + # medi_stackexchange + # medi_flickr + # medi_supernli + # hotpot + "HotPotQA": ["test"], + 
"HotPotQAHardNegatives": ["test"], + "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) + # fever + "FEVER": ["test"], + "FEVERHardNegatives": ["test"], +} + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/eval/mteb_eval/eval_mteb.py#L142-L159 model_prompts = { "Classification": "classification: ", @@ -138,8 +211,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, - training_datasets=None, + public_training_data=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, ) nomic_embed_v1 = ModelMeta( @@ -166,8 +240,9 @@ def encode( # type: ignore citation=NOMIC_CITATION, adapted_from=None, superseded_by="nomic-ai/nomic-embed-text-v1.5", - public_training_code=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) nomic_embed_v1_ablated = ModelMeta( @@ -193,8 +268,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) @@ -221,8 +297,9 @@ def encode( # type: ignore use_instructions=True, adapted_from=None, superseded_by=None, - public_training_code=None, - training_datasets=None, + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune.yaml", + training_datasets=nomic_training_data, + public_training_data=None, ) nomic_modern_bert_embed = ModelMeta( @@ -248,8 +325,10 @@ def encode( # type: ignore similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=True, - adapted_from=None, + adapted_from="answerdotai/ModernBERT-base", + public_training_code="https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_pretrain_modernbert.yaml", + # https://github.com/nomic-ai/contrastors/blob/5f7b461e5a13b5636692d1c9f1141b27232fe966/src/contrastors/configs/train/contrastive_finetune_modernnomic.yaml superseded_by=None, - public_training_code=None, - training_datasets=None, + training_datasets=nomic_training_data, + public_training_data=None, ) diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 1f345a62be..1997a85274 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -141,6 +141,7 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) NV_embed_v1 = ModelMeta( @@ -164,4 +165,5 @@ def encode( use_instructions=True, training_datasets=nvidia_training_datasets, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/openai_models.py b/mteb/models/openai_models.py index 863c9d7828..079e7c9361 100644 --- a/mteb/models/openai_models.py +++ b/mteb/models/openai_models.py @@ -135,7 +135,8 @@ def 
_to_numpy(self, embedding_response) -> np.ndarray: similarity_fn_name="cosine", framework=["API"], use_instructions=False, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, ) text_embedding_3_large = ModelMeta( @@ -156,7 +157,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, @@ -179,7 +181,8 @@ def _to_numpy(self, embedding_response) -> np.ndarray: framework=["API"], use_instructions=False, n_parameters=None, - public_training_code=None, # assumed + public_training_code=None, + public_training_data=None, # assumed training_datasets=None, license=None, similarity_fn_name=None, diff --git a/mteb/models/overview.py b/mteb/models/overview.py index c72fe2ed89..e23285ff68 100644 --- a/mteb/models/overview.py +++ b/mteb/models/overview.py @@ -15,10 +15,12 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, e5_models, + gme_models, google_models, gritlm_models, gte_models, @@ -26,6 +28,7 @@ inf_models, jasper_models, jina_models, + lens_models, linq_models, llm2vec_models, misc_models, @@ -56,6 +59,7 @@ arctic_models, bge_models, bm25, + cde_models, cohere_models, colbert_models, e5_instruct, @@ -64,9 +68,11 @@ google_models, gritlm_models, gte_models, + gme_models, ibm_granite_models, inf_models, jina_models, + lens_models, linq_models, llm2vec_models, mxbai_models, @@ -210,6 +216,25 @@ def get_model_meta(model_name: str, revision: str | None = None) -> ModelMeta: return meta +empty_model_meta = ModelMeta( + name=None, + revision=None, + languages=None, + release_date=None, + n_parameters=None, + max_tokens=None, + embed_dim=None, + license=None, + open_weights=True, + public_training_code=None, + public_training_data=None, + similarity_fn_name=None, + use_instructions=None, + training_datasets=None, + framework=[], +) + + @lru_cache def model_meta_from_hf_hub(model_name: str) -> ModelMeta: try: @@ -234,26 +259,14 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: embed_dim=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, ) except Exception as e: logger.warning(f"Failed to extract metadata from model: {e}.") - return ModelMeta( - name=model_name, - revision=None, - languages=None, - release_date=None, - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=True, - public_training_code=None, - similarity_fn_name=None, - use_instructions=None, - training_datasets=None, - framework=[], - ) + meta = empty_model_meta + meta.name = model_name + return meta def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: @@ -273,6 +286,7 @@ def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, training_datasets=None, ) @@ -280,22 +294,7 @@ def model_meta_from_cross_encoder(model: CrossEncoder) -> ModelMeta: logger.warning( f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." 
) - meta = ModelMeta( - name=None, - revision=None, - languages=None, - release_date=None, - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=True, - public_training_code=None, - similarity_fn_name=None, - use_instructions=None, - training_datasets=None, - framework=[], - ) + meta = empty_model_meta return meta @@ -325,6 +324,7 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe license=None, open_weights=True, public_training_code=None, + public_training_data=None, use_instructions=None, training_datasets=None, ) @@ -332,20 +332,5 @@ def model_meta_from_sentence_transformers(model: SentenceTransformer) -> ModelMe logger.warning( f"Failed to extract metadata from model: {e}. Upgrading to sentence-transformers v3.0.0 or above is recommended." ) - meta = ModelMeta( - name=None, - revision=None, - languages=None, - release_date=None, - n_parameters=None, - max_tokens=None, - embed_dim=None, - license=None, - open_weights=True, - public_training_code=None, - similarity_fn_name=None, - use_instructions=None, - training_datasets=None, - framework=[], - ) + meta = empty_model_meta return meta diff --git a/mteb/models/piccolo_models.py b/mteb/models/piccolo_models.py index bb92b55673..d51487b8ba 100644 --- a/mteb/models/piccolo_models.py +++ b/mteb/models/piccolo_models.py @@ -21,6 +21,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't specify ) @@ -42,5 +43,6 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets=None, # They don't say ) diff --git a/mteb/models/promptriever_models.py b/mteb/models/promptriever_models.py index 7dc98a26a5..df2204defe 100644 --- a/mteb/models/promptriever_models.py +++ b/mteb/models/promptriever_models.py @@ -80,6 +80,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) promptriever_llama3 = ModelMeta( @@ -107,6 +108,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) @@ -135,6 +137,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) promptriever_mistral_v1 = ModelMeta( @@ -162,4 +165,5 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=PROMPTRIEVER_CITATION, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/repllama_models.py b/mteb/models/repllama_models.py index 8faa2c490f..ffe1f0bd87 100644 --- a/mteb/models/repllama_models.py +++ b/mteb/models/repllama_models.py @@ -172,6 +172,7 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=REPLLAMA_CITATION, public_training_code=None, + public_training_data=None, ) @@ -199,5 +200,6 @@ def loader_inner(**kwargs: Any) -> Encoder: use_instructions=True, citation=REPLLAMA_CITATION, public_training_code=None, + public_training_data=None, training_datasets=None, ) diff --git a/mteb/models/rerankers_custom.py b/mteb/models/rerankers_custom.py index bedfd09604..34adea7ffd 100644 --- a/mteb/models/rerankers_custom.py +++ b/mteb/models/rerankers_custom.py @@ -11,6 +11,7 @@ from mteb.encoder_interface import Encoder from mteb.evaluation.evaluators.RetrievalEvaluator import DenseRetrievalExactSearch from 
mteb.model_meta import ModelMeta +from mteb.models.bge_models import bge_m3_training_data logger = logging.getLogger(__name__) @@ -209,6 +210,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -233,6 +235,7 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -289,9 +292,10 @@ def loader_inner(**kwargs: Any) -> Encoder: embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, - training_datasets=None, + training_datasets=bge_m3_training_data, framework=["Sentence Transformers", "PyTorch"], citation=""" @misc{li2023making, diff --git a/mteb/models/rerankers_monot5_based.py b/mteb/models/rerankers_monot5_based.py index f45addb18f..320ee4bc7d 100644 --- a/mteb/models/rerankers_monot5_based.py +++ b/mteb/models/rerankers_monot5_based.py @@ -301,6 +301,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -342,6 +343,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -365,6 +367,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -397,6 +400,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -452,6 +456,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -497,6 +502,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -542,6 +548,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -587,6 +594,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -610,6 +618,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -651,6 +660,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -674,6 +684,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + 
public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, @@ -707,6 +718,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -854,6 +866,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, framework=["PyTorch"], @@ -876,6 +889,7 @@ def get_prediction_tokens(self, *args, **kwargs): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git a/mteb/models/ru_sentence_models.py b/mteb/models/ru_sentence_models.py index 1869ce62db..683c8c5024 100644 --- a/mteb/models/ru_sentence_models.py +++ b/mteb/models/ru_sentence_models.py @@ -6,42 +6,53 @@ from mteb.model_meta import ModelMeta, sentence_transformers_loader -from .bge_models import bge_training_data +from .bge_models import bge_m3_training_data -rubert_tiny2 = ModelMeta( - name="cointegrated/rubert-tiny2", +rubert_tiny = ModelMeta( + name="cointegrated/rubert-tiny", languages=["rus_Cyrl"], open_weights=True, - revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", - release_date="2021-10-28", + revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", + release_date="2021-05-24", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny2", + reference="https://huggingface.co/cointegrated/rubert-tiny", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, - training_datasets=None, + public_training_code="https://gist.github.com/avidale/7bc6350f26196918bf339c01261f5c60", + training_datasets={ + # [Yandex Translate corpus](https://translate.yandex.ru/corpus), [OPUS-100](https://huggingface.co/datasets/opus100) + "Tatoeba": ["train"], + }, + adapted_from="google-bert/bert-base-multilingual-cased", + public_training_data=None, ) -rubert_tiny = ModelMeta( - name="cointegrated/rubert-tiny", +rubert_tiny2 = ModelMeta( + name="cointegrated/rubert-tiny2", languages=["rus_Cyrl"], open_weights=True, - revision="5441c5ea8026d4f6d7505ec004845409f1259fb1", - release_date="2021-05-24", + revision="dad72b8f77c5eef6995dd3e4691b758ba56b90c3", + release_date="2021-10-28", n_parameters=29_400_000, embed_dim=312, license="mit", max_tokens=2048, - reference="https://huggingface.co/cointegrated/rubert-tiny", + reference="https://huggingface.co/cointegrated/rubert-tiny2", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, - training_datasets=None, + public_training_code="https://colab.research.google.com/drive/1mSWfIQ6PIlteLVZ9DKKpcorycgLIKZLf?usp=sharing", + training_datasets={ + # https://huggingface.co/datasets/cointegrated/ru-paraphrase-NMT-Leipzig + # Wikipedia https://huggingface.co/datasets/Madjogger/JamSpell_dataset + # https://huggingface.co/datasets/imvladikon/leipzig_corpora_collection + }, + adapted_from="cointegrated/rubert-tiny", + public_training_data=None, ) sbert_large_nlu_ru = ModelMeta( @@ -59,6 +70,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -77,7 +89,11 @@ 
framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=None, + public_training_data=None, + training_datasets={ + # SNLI, MNLI + # https://github.com/brmson/dataset-sts + }, ) user_base_ru = ModelMeta( @@ -93,12 +109,13 @@ revision="436a489a2087d61aa670b3496a9915f84e46c861", release_date="2024-06-10", n_parameters=427_000_000, - embed_dim=1024, - license="Not specified", - max_tokens=512, # best guess - reference="https://huggingface.co/ai-forever/sbert_large_mt_nlu_ru", + embed_dim=768, + license="apache-2.0", + max_tokens=512, + reference="https://huggingface.co/deepvk/USER-base", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/deepvk/deberta-v1-base", use_instructions=True, citation="""@misc{deepvk2024user, title={USER: Universal Sentence Encoder for Russian}, @@ -108,13 +125,68 @@ year={2024}, } """, + training_datasets={ + "BibleNLPBitextMining": ["train"], + # https://github.com/unicamp-dl/mMARCO + # deepvk/ru-HNP + # deepvk/ru-WANLI + # MedNLI + # RCB + "TERRa": ["train"], + # Tapaco + # Opus100 + # BiblePar + # RudetoxifierDataDetox + # RuParadetox + "MIRACL": ["train"], + # MLDR + # Lenta + "MLSUMClusteringP2P": ["train"], + "MLSUMClusteringP2P.v2": ["train"], + "MLSUMClusteringS2S": ["train"], + "MLSUMClusteringS2S.v2": ["train"], + "MrTidyRetrieval": ["train"], + # "Panorama" + # PravoIsrael + # xlsum + # Fialka-v1 + # RussianKeywords + # Gazeta + # Gsm8k-ru + # DSumRu + # SummDialogNews + }, + public_training_code=None, + public_training_data=None, +) + +user_bge_m3 = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="deepvk/USER-bge-m3", + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + ), + name="deepvk/USER-bge-m3", + languages=["rus_Cyrl"], + open_weights=True, + revision="0cc6cfe48e260fb0474c753087a69369e88709ae", + release_date="2024-07-05", + n_parameters=359_026_688, + embed_dim=1024, + license="apache-2.0", + max_tokens=8194, + reference="https://huggingface.co/deepvk/USER-base", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + adapted_from="https://huggingface.co/BAAI/bge-m3", + use_instructions=False, training_datasets={ "BibleNLPBitextMining": ["train"], "MLSUMClusteringP2P": ["train"], "MLSUMClusteringP2P.v2": ["train"], "MLSUMClusteringS2S": ["train"], "MLSUMClusteringS2S.v2": ["train"], - **bge_training_data, + **bge_m3_training_data, # not MTEB: # "deepvk/ru-HNP": ["train"], # "deepvk/ru-WANLI": ["train"], @@ -132,6 +204,7 @@ # "CarlBrendt/Summ_Dialog_News": ["train"], }, public_training_code=None, + public_training_data=None, ) deberta_v1_ru = ModelMeta( @@ -148,7 +221,9 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, + # Wikipedia, Books, Twitter comments, Pikabu, Proza.ru, Film subtitles, News websites, and Social corpus public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -161,12 +236,13 @@ n_parameters=1280_000_000, embed_dim=768, license="Not specified", - max_tokens=512, # best guess + max_tokens=512, reference="https://huggingface.co/DeepPavlov/rubert-base-cased", similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, citation="""@misc{kuratov2019adaptationdeepbidirectionalmultilingual, title={Adaptation of Deep Bidirectional 
Multilingual Transformers for Russian Language}, @@ -194,6 +270,7 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, citation="""@misc{https://doi.org/10.48550/arxiv.2205.02340, doi = {10.48550/ARXIV.2205.02340}, @@ -222,7 +299,11 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, - training_datasets=None, + public_training_data=None, + training_datasets={ + # "SNLI": [], + "XNLI": ["dev"] + }, ) labse_en_ru = ModelMeta( @@ -239,8 +320,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - public_training_code=None, + public_training_code="https://colab.research.google.com/drive/1dnPRn0-ugj3vZgSpyCC9sgslM2SuSfHy?usp=sharing", + public_training_data=None, training_datasets=None, + adapted_from="sentence-transformers/LaBSE", ) rubert_tiny_turbo = ModelMeta( @@ -258,8 +341,10 @@ framework=["Sentence Transformers", "PyTorch"], use_instructions=False, public_training_code=None, + public_training_data=None, training_datasets=None, # source model in unknown # Not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, + adapted_from="cointegrated/rubert-tiny2", ) labse_ru_turbo = ModelMeta( @@ -276,9 +361,11 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch"], use_instructions=False, - training_datasets=None, # source model in unknown + training_datasets=None, # not MTEB: {"IlyaGusev/gazeta": ["train"], "zloelias/lenta-ru": ["train"]}, public_training_code=None, + adapted_from="cointegrated/LaBSE-en-ru", + public_training_data=None, ) @@ -305,8 +392,24 @@ embed_dim=1024, license="mit", similarity_fn_name="cosine", + adapted_from="ai-forever/ruRoberta-large", + training_datasets={ + # https://huggingface.co/ai-forever/ruRoberta-large + # https://huggingface.co/datasets/IlyaGusev/yandex_q_full + # https://huggingface.co/datasets/IlyaGusev/pikabu + # https://huggingface.co/datasets/IlyaGusev/ru_stackoverflow + # https://huggingface.co/datasets/IlyaGusev/habr + # https://huggingface.co/datasets/its5Q/habr_qna + # NewsCommentary + # MultiParaCrawl + "XNLI": [], + "XNLIV2": [], + "LanguageClassification": [], # XNLI + "MIRACLReranking": ["train"], + "MIRACLRetrieval": ["train"], + }, + public_training_data=None, public_training_code=None, - training_datasets=None, framework=["Sentence Transformers", "PyTorch"], citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb, title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design}, diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index fd54871663..c5ba799338 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -6,6 +6,8 @@ from mteb.model_meta import ModelMeta from mteb.models.instruct_wrapper import instruct_wrapper +from .e5_instruct import E5_MISTRAL_TRAINING_DATA + def instruction_template( instruction: str, prompt_type: PromptType | None = None @@ -13,6 +15,19 @@ def instruction_template( return f"Instruct: {instruction}\nQuery: " if instruction else "" +SFR_TRAINING_DATA = { # inherits from e5 + **E5_MISTRAL_TRAINING_DATA, + # From previously released blogpost which now have been taken down: + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "FEVER-PL": ["train"], # translation not trained on + "HotpotQA": ["train"], + 
"HotpotQAHardNegatives": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on +} + SFR_Embedding_2_R = ModelMeta( loader=partial( # type: ignore instruct_wrapper, @@ -41,16 +56,8 @@ def instruction_template( use_instructions=True, adapted_from="intfloat/e5-mistral-7b-instruct", public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, citation="""@misc{SFR-embedding-2, title={SFR-Embedding-2: Advanced Text Embedding with Multi-stage Training}, author={Rui Meng*, Ye Liu*, Shafiq Rayhan Joty, Caiming Xiong, Yingbo Zhou, Semih Yavuz}, @@ -86,14 +93,6 @@ def instruction_template( framework=["Sentence Transformers", "PyTorch"], use_instructions=True, public_training_code=None, - training_datasets={ # inherits from e5 - "MSMARCO": ["train"], - "MSMARCOHardNegatives": ["train"], - "NanoMSMARCORetrieval": ["train"], - "MSMARCO-PL": ["train"], # translation not trained on - "NQ": ["train"], - "NQHardNegatives": ["train"], - "NanoNQRetrieval": ["train"], - "NQ-PL": ["train"], # translation not trained on - }, + public_training_data=None, + training_datasets=SFR_TRAINING_DATA, ) diff --git a/mteb/models/sentence_transformers_models.py b/mteb/models/sentence_transformers_models.py index 63be6e925c..73dcf8a666 100644 --- a/mteb/models/sentence_transformers_models.py +++ b/mteb/models/sentence_transformers_models.py @@ -129,6 +129,7 @@ adapted_from=None, training_datasets=sent_trf_training_dataset, public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) @@ -151,6 +152,7 @@ training_datasets=sent_trf_training_dataset, public_training_code=None, citation=SBERT_CITATION, + public_training_data=None, ) paraphrase_multilingual_MiniLM_L12_v2 = ModelMeta( @@ -172,6 +174,7 @@ training_datasets=sent_trf_training_dataset, # assumed (probably some parallel as well) public_training_code=None, citation=SBERT_CITATION, + public_training_data=None, ) paraphrase_multilingual_mpnet_base_v2 = ModelMeta( @@ -204,6 +207,7 @@ # "yahoo-answers": yahoo_answers_train_dataset, # "stack-exchange": stack_exchange_train_dataset, public_training_code=None, + public_training_data=None, ) labse = ModelMeta( @@ -233,6 +237,7 @@ primaryClass={cs.CL}, url={https://arxiv.org/abs/2007.01852}, }""", + public_training_data=None, ) multi_qa_MiniLM_L6_cos_v1 = ModelMeta( @@ -253,6 +258,7 @@ adapted_from="nreimers/MiniLM-L6-H384-uncased", training_datasets=sent_trf_training_dataset, # assumed public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) @@ -272,39 +278,9 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # does sentence transformer count? 
- training_datasets={ - # source: frontmatter in readme - # trained on stack exchange, unsure if sources match - "StackExchangeClusteringP2P": ["test"], - "StackExchangeClusteringP2P.v2": ["test"], - "StackExchangeClustering": ["test"], - "StackExchangeClustering.v2": ["test"], - "NQ": ["test"], - "NQHardNegatives": ["test"], - "MSMARCO": ["train"], - # Non MTEB sources - # "s2orc": ["train"], - # "flax-sentence-embeddings/stackexchange_xml": ["train"], - # "ms_marco": ["train"], - # "gooaq": ["train"], - # "yahoo_answers_topics": ["train"], - # "code_search_net": ["train"], - # "search_qa": ["train"], - # "eli5": ["train"], - # "snli": ["train"], - # "multi_nli": ["train"], - # "wikihow": ["train"], - # "trivia_qa": ["train"], - # "embedding-data/sentence-compression": ["train"], - # "embedding-data/flickr30k-captions": ["train"], - # "embedding-data/altlex": ["train"], - # "embedding-data/simple-wiki": ["train"], - # "embedding-data/QQP": ["train"], - # "embedding-data/SPECTER": ["train"], - # "embedding-data/PAQ_pairs": ["train"], - # "embedding-data/WikiAnswers": ["train"], - }, + training_datasets=sent_trf_training_dataset, + public_training_code=None, + public_training_data=None, citation=SBERT_CITATION, ) @@ -337,6 +313,7 @@ doi = {10.48550/ARXIV.2112.09118}, }""", public_training_code=None, + public_training_data=None, training_datasets=None, ) @@ -368,4 +345,5 @@ # "sentence-transformers/natural-questions": ["train"], }, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 44aa1f8604..92d5db7c8a 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,7 +29,9 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, + # will be at https://github.com/NLPJCL/RAG-Retrieval public_training_code=None, + public_training_data=None, ) stella_en_1_5b = ModelMeta( @@ -54,8 +56,10 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", + # will be at https://github.com/NLPJCL/RAG-Retrieval training_datasets=None, public_training_code=None, + public_training_data=None, ) stella_large_zh_v3_1792d = ModelMeta( @@ -75,6 +79,7 @@ superseded_by="dunzhang/stella-mrl-large-zh-v3.5-1792d", adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -99,6 +104,7 @@ superseded_by=None, adapted_from=None, public_training_code=None, + public_training_data=None, training_datasets={ # Not in MTEB: # - infgrad/dialogue_rewrite_llm @@ -124,6 +130,7 @@ superseded_by=None, adapted_from="dunzhang/stella-large-zh-v3-1792d", public_training_code=None, + public_training_data=None, training_datasets=None, # Not specified ) @@ -144,6 +151,7 @@ superseded_by=None, adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d", public_training_code=None, + public_training_data=None, training_datasets={ # It's a bit unclear what they have trained on to be honest, because they don't list all # And they also have some rather cryptic description of their training procedure, but at diff --git a/mteb/models/text2vec_models.py b/mteb/models/text2vec_models.py index 12322e69e9..86a9bcca4f 100644 --- a/mteb/models/text2vec_models.py +++ b/mteb/models/text2vec_models.py @@ -21,7 +21,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - 
public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -46,7 +47,8 @@ use_instructions=False, superseded_by=None, adapted_from=None, - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB @@ -87,7 +89,8 @@ use_instructions=False, superseded_by=None, adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", - public_training_code=None, # Couldn't find it + public_training_code=None, + public_training_data=None, # Couldn't find it training_datasets={ # source: https://huggingface.co/shibing624/text2vec-base-chinese # Not in MTEB diff --git a/mteb/models/uae_models.py b/mteb/models/uae_models.py index e3cdaa8436..a12a936326 100644 --- a/mteb/models/uae_models.py +++ b/mteb/models/uae_models.py @@ -91,4 +91,5 @@ def encode( "SNLI": [], }, public_training_code=None, + public_training_data=None, ) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index a98bc041bc..a637dee36a 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -12,6 +12,11 @@ from .wrapper import Wrapper +VOYAGE_TRAINING_DATA = { + # Self-reported (message from VoyageAI member) + # synthetic data +} + def token_limit(max_tpm: int, interval: int = 60): limit_interval_start_ts = time.time() @@ -156,8 +161,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_finance_2 = ModelMeta( @@ -179,8 +185,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_law_2 = ModelMeta( @@ -202,8 +209,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_code_2 = ModelMeta( @@ -225,8 +233,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_large_2 = ModelMeta( @@ -248,8 +257,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_2 = ModelMeta( @@ -271,8 +281,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_multilingual_2 = ModelMeta( name="voyageai/voyage-multilingual-2", @@ -293,8 +304,9 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_3 = ModelMeta( @@ -316,8 +328,9 @@ def _batched_encode( similarity_fn_name="cosine", 
framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, public_training_code=None, + public_training_data=None, ) voyage_3_lite = ModelMeta( @@ -339,6 +352,79 @@ def _batched_encode( similarity_fn_name="cosine", framework=["API"], use_instructions=True, - training_datasets=None, + training_datasets=VOYAGE_TRAINING_DATA, + public_training_code=None, + public_training_data=None, +) + + +voyage_3_exp = ModelMeta( + name="voyageai/voyage-3-m-exp", + revision="1", + release_date=None, # not released + languages=None, # supported languages not specified + loader=partial( + VoyageWrapper, + model_name="voyage-3-m-exp", + model_prompts=model_prompts, + ), + max_tokens=32000, + embed_dim=512, + open_weights=False, + n_parameters=None, + license=None, + reference="https://huggingface.co/voyageai/voyage-3-m-exp", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=True, + training_datasets={ + # MTEB(eng, classic) training data: + "ArguAna": ["train"], + "ArguAna-PL": ["train"], + "NanoArguAnaRetrieval": ["train"], + "HotpotQA": ["train"], + "HotpotQA-PL": ["train"], # translation not trained on + "HotpotQAHardNegatives": ["train"], + "MSMARCO": ["train"], + "MSMARCOHardNegatives": ["train"], + "NanoMSMARCORetrieval": ["train"], + "MSMARCO-PL": ["train"], # translation not trained on + "NQ": ["train"], + "NQHardNegatives": ["train"], + "NanoNQRetrieval": ["train"], + "NQ-PL": ["train"], # translation not trained on + "FEVER": ["train"], + "FEVERHardNegatives": ["train"], + "NanoFEVERRetrieval": ["train"], + "FiQA2018": ["train"], + "FiQA2018-PL": ["train"], # translation not trained on + "STS12": ["train"], + "STS22": ["train"], + "AmazonReviewsClassification": ["train"], + "AmazonCounterfactualClassification": ["train"], + "Banking77Classification": ["train"], + "EmotionClassification": ["train"], + "ImdbClassification": ["train"], + "MTOPIntentClassification": ["train"], + "ToxicConversationsClassification": ["train"], + "TweetSentimentExtractionClassification": ["train"], + "ArxivClusteringP2P": ["train"], + "ArxivClusteringP2P.v2": ["train"], + "ArxivClusteringS2S": ["train"], + "ArxivClusteringS2S.v2": ["train"], + "BiorxivClusteringP2P": ["train"], + "BiorxivClusteringP2P.v2": ["train"], + "BiorxivClusteringS2S": ["train"], + "BiorxivClusteringS2S.v2": ["train"], + "MedrxivClusteringP2P": ["train"], + "MedrxivClusteringP2P.v2": ["train"], + "MedrxivClusteringS2S": ["train"], + "MedrxivClusteringS2S.v2": ["train"], + "TwentyNewsgroupsClustering": ["train"], + "TwentyNewsgroupsClustering.v2": ["train"], + "STSBenchmark": ["train"], + "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on + }, public_training_code=None, + public_training_data=None, ) diff --git a/pyproject.toml b/pyproject.toml index 58c94a1979..f42014e3a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.10" +version = "1.29.16" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ diff --git a/scripts/generate_metadata.py b/scripts/generate_metadata.py index a192fa1341..4ae87fdbca 100644 --- a/scripts/generate_metadata.py +++ b/scripts/generate_metadata.py @@ -242,6 +242,7 @@ def model_meta_from_hf_hub(model_name: str) -> ModelMeta: license=None, open_weights=True, public_training_code=None, + public_training_data=None, similarity_fn_name=None, use_instructions=None, training_datasets=None, diff --git 
a/tests/test_tasks/test_mteb_rerank.py b/tests/test_tasks/test_mteb_rerank.py index 4a535bebbd..7705de4d3f 100644 --- a/tests/test_tasks/test_mteb_rerank.py +++ b/tests/test_tasks/test_mteb_rerank.py @@ -374,6 +374,7 @@ def test_reranker_same_ndcg1(tmp_path: Path): embed_dim=None, license=None, public_training_code=None, + public_training_data=None, reference=None, similarity_fn_name=None, use_instructions=None,
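The convention applied across the `mteb/models/*` files above can be summarised with a minimal, hypothetical `ModelMeta` entry. This is a sketch only, not part of the diff: the model name, dimensions, and token limit are invented, and the exact set of required `ModelMeta` fields may vary slightly between mteb versions.

```python
from mteb.model_meta import ModelMeta

# Hypothetical entry; illustrates the provenance fields this PR standardises.
example_meta = ModelMeta(
    name="example-org/example-embedder",  # invented name, for illustration only
    revision="1",
    release_date=None,      # not released / unknown
    languages=None,         # supported languages not specified
    loader=None,            # e.g. an API wrapper could be supplied here
    max_tokens=512,         # invented value
    embed_dim=768,          # invented value
    open_weights=False,
    n_parameters=None,
    license=None,
    reference=None,
    similarity_fn_name="cosine",
    framework=["API"],
    use_instructions=False,
    # Self-reported: trained only on proprietary/synthetic data, no MTEB overlap.
    training_datasets={},
    # None consistently means "not public / could not be found", never "omitted".
    public_training_code=None,
    public_training_data=None,
)
```

As with `VOYAGE_TRAINING_DATA`, an empty but annotated `training_datasets` dict records a self-reported "no MTEB datasets used" claim, whereas `None` is reserved for models whose training data is simply not specified (e.g. the Stella English models above).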