diff --git a/mteb/leaderboard/__init__.py b/mteb/leaderboard/__init__.py index 1dc3560a64..1db3fa2545 100644 --- a/mteb/leaderboard/__init__.py +++ b/mteb/leaderboard/__init__.py @@ -1,5 +1,5 @@ from __future__ import annotations -from mteb.leaderboard.app import demo +from mteb.leaderboard.app import get_leaderboard_app -__all__ = ["demo"] +__all__ = ["get_leaderboard_app"] diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 68ef6b56d4..e3833b5ce3 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -23,13 +23,6 @@ from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import create_tables -logging.getLogger("mteb.load_results.task_results").setLevel( - logging.WARNING -) # Warnings related to task split -logging.getLogger("mteb.models.overview").setLevel( - logging.WARNING -) # Warning related to model metadata (fetch_from_hf=False) -warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*") logger = logging.getLogger(__name__) acknowledgment_md = """ @@ -210,663 +203,688 @@ def filter_models( return list(models_to_keep) -logger.info("Loading all benchmark results") -all_results = load_results() - -benchmarks = sorted( - mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name -) -all_benchmark_results = { - benchmark.name: benchmark.load_results(base_results=all_results).join_revisions() - for benchmark in benchmarks -} -default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) -default_results = all_benchmark_results[default_benchmark.name] -logger.info("Benchmark results loaded") - -default_scores = default_results.get_scores(format="long") -all_models = list({entry["model_name"] for entry in default_scores}) -filtered_models = filter_models( - all_models, - default_results.task_names, - availability=None, - compatibility=[], - instructions=None, - model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - zero_shot_setting="allow_all", -) - -summary_table, per_task_table = create_tables( - [entry for entry in default_scores if entry["model_name"] in filtered_models] -) - -benchmark_select = gr.Dropdown( - [bench.name for bench in benchmarks], - value=default_benchmark.name, - label="Prebuilt Benchmarks", - info="Select one of our expert-selected benchmarks from MTEB publications.", -) -lang_select = gr.Dropdown( - ISO_TO_LANGUAGE, - value=sorted(default_results.languages), - allow_custom_value=True, - multiselect=True, - label="Language", - info="Select languages to include.", -) -type_select = gr.Dropdown( - sorted(get_args(TASK_TYPE)), - value=sorted(default_results.task_types), - multiselect=True, - label="Task Type", - info="Select task types to include.", -) -domain_select = gr.Dropdown( - sorted(get_args(TASK_DOMAIN)), - value=sorted(default_results.domains), - multiselect=True, - label="Domain", - info="Select domains to include.", -) -task_select = gr.Dropdown( - sorted(all_results.task_names), - value=sorted(default_results.task_names), - allow_custom_value=True, - multiselect=True, - label="Task", - info="Select specific tasks to include", -) -modality_select = gr.Dropdown( - sorted(get_args(MODALITIES)), - value=sorted(default_results.modalities), - multiselect=True, - label="Modality", - info="Select modalities to include.", -) - -head = """ - -""" +def get_leaderboard_app() -> gr.Blocks: + logger.info("Loading all benchmark results") + all_results = load_results() -with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: - gr.Markdown( - """ - ## Embedding Leaderboard + benchmarks = sorted( + mteb.get_benchmarks(display_on_leaderboard=True), key=lambda x: x.name + ) + all_benchmark_results = { + benchmark.name: benchmark.load_results( + base_results=all_results + ).join_revisions() + for benchmark in benchmarks + } + default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) + default_results = all_benchmark_results[default_benchmark.name] + logger.info("Benchmark results loaded") + + default_scores = default_results.get_scores(format="long") + all_models = list({entry["model_name"] for entry in default_scores}) + filtered_models = filter_models( + all_models, + default_results.task_names, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot_setting="allow_all", + ) - This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ + summary_table, per_task_table = create_tables( + [entry for entry in default_scores if entry["model_name"] in filtered_models] + ) - > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. - """ + benchmark_select = gr.Dropdown( + [bench.name for bench in benchmarks], + value=default_benchmark.name, + label="Prebuilt Benchmarks", + info="Select one of our expert-selected benchmarks from MTEB publications.", + ) + lang_select = gr.Dropdown( + ISO_TO_LANGUAGE, + value=sorted(default_results.languages), + allow_custom_value=True, + multiselect=True, + label="Language", + info="Select languages to include.", + ) + type_select = gr.Dropdown( + sorted(get_args(TASK_TYPE)), + value=sorted(default_results.task_types), + multiselect=True, + label="Task Type", + info="Select task types to include.", + ) + domain_select = gr.Dropdown( + sorted(get_args(TASK_DOMAIN)), + value=sorted(default_results.domains), + multiselect=True, + label="Domain", + info="Select domains to include.", + ) + task_select = gr.Dropdown( + sorted(all_results.task_names), + value=sorted(default_results.task_names), + allow_custom_value=True, + multiselect=True, + label="Task", + info="Select specific tasks to include", + ) + modality_select = gr.Dropdown( + sorted(get_args(MODALITIES)), + value=sorted(default_results.modalities), + multiselect=True, + label="Modality", + info="Select modalities to include.", ) - with gr.Row(): - with gr.Column(scale=5): - gr.Markdown( - """ - ### Benchmarks - Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs. + head = """ + + """ + + with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + gr.Markdown( """ - ) - with gr.Group(): - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - with gr.Column(): - benchmark_select.render() - with gr.Accordion("Select Languages", open=False): - lang_select.render() - with gr.Accordion("Select Task Types", open=False): - type_select.render() - with gr.Accordion("Select Domains", open=False): - domain_select.render() - with gr.Accordion("Select Modalities", open=False): - modality_select.render() - with gr.Accordion("Add and remove tasks:", open=False): - task_select.render() - with gr.Column(scale=8): - gr.Markdown( - """ - ### Model Selection - Select models to rank based on an assortment of criteria. - """, - ) - with gr.Group(): - with gr.Row(): - searchbar = gr.Textbox( - label="Search Models", - info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)", - interactive=True, - ) - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "Sentence Transformers", - ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - with gr.Row(elem_classes=""): - with gr.Column(): - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", + ## Embedding Leaderboard + + This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ + + > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. + """ + ) + + with gr.Row(): + with gr.Column(scale=5): + gr.Markdown( + "### Benchmarks\n" + "Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs." + ) + with gr.Group(): + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + with gr.Column(): + benchmark_select.render() + with gr.Accordion("Select Languages", open=False): + lang_select.render() + with gr.Accordion("Select Task Types", open=False): + type_select.render() + with gr.Accordion("Select Domains", open=False): + domain_select.render() + with gr.Accordion("Select Modalities", open=False): + modality_select.render() + with gr.Accordion("Add and remove tasks:", open=False): + task_select.render() + with gr.Column(scale=8): + gr.Markdown( + """ + ### Model Selection + Select models to rank based on an assortment of criteria. + """, + ) + with gr.Group(): + with gr.Row(): + searchbar = gr.Textbox( + label="Search Models", + info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)", interactive=True, ) - with gr.Column(): - zero_shot = gr.Radio( + compatibility = gr.CheckboxGroup( [ ( - "Only Zero-shot", - "only_zero_shot", - ), - ("Remove Unknown", "remove_unknown"), - ("Allow All", "allow_all"), + "Should be sentence-transformers compatible", + "Sentence Transformers", + ) ], - value="allow_all", - label="Zero-shot", + value=[], + label="Compatibility", interactive=True, ) - model_size = RangeSlider( - minimum=MIN_MODEL_SIZE, - maximum=MAX_MODEL_SIZE, - value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - label="Model Size (#M Parameters)", - ) - scores = gr.State(default_scores) - models = gr.State(filtered_models) - with gr.Row(): - with gr.Column(): - description = gr.Markdown( - update_description, - inputs=[benchmark_select, lang_select, type_select, domain_select], + with gr.Row(elem_classes=""): + with gr.Column(): + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + with gr.Column(): + zero_shot = gr.Radio( + [ + ( + "Only Zero-shot", + "only_zero_shot", + ), + ("Remove Unknown", "remove_unknown"), + ("Allow All", "allow_all"), + ], + value="allow_all", + label="Zero-shot", + interactive=True, + ) + model_size = RangeSlider( + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + label="Model Size (#M Parameters)", + ) + scores = gr.State(default_scores) + models = gr.State(filtered_models) + with gr.Row(): + with gr.Column(): + description = gr.Markdown( # noqa: F841 + update_description, + inputs=[benchmark_select, lang_select, type_select, domain_select], + ) + citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 + with gr.Accordion("Share this benchmark:", open=False): + gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) + with gr.Column(): + with gr.Tab("Performance per Model Size"): + plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all tasks in the benchmark*" + ) + with gr.Tab("Performance per Task Type (Radar Chart)"): + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all task types in the benchmark*" + ) + with gr.Tab("Summary"): + summary_table.render() + download_summary = gr.DownloadButton("Download Table") + download_summary.click( + download_table, inputs=[summary_table], outputs=[download_summary] ) - citation = gr.Markdown(update_citation, inputs=[benchmark_select]) - with gr.Accordion("Share this benchmark:", open=False): - gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Column(): - with gr.Tab("Performance per Model Size"): - plot = gr.Plot(performance_size_plot, inputs=[summary_table]) + + with gr.Accordion( + "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", + open=False, + ): gr.Markdown( - "*We only display models that have been run on all tasks in the benchmark*" + """ + **Rank(borda)** is computed based on the [borda count](https://en.wikipedia.org/wiki/Borda_count), where each task is treated as a preference voter, which gives votes on the models per their relative performance on the task. The best model obtains the highest number of votes. The model with the highest number of votes across tasks obtains the highest rank. The Borda rank tends to prefer models that perform well broadly across tasks. However, given that it is a rank it can be unclear if the two models perform similarly. + + **Mean(Task)**: This is a naïve average computed across all the tasks within the benchmark. This score is simple to understand and is continuous as opposed to the Borda rank. However, the mean can overvalue tasks with higher variance in its scores. + + **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. + """ ) - with gr.Tab("Performance per Task Type (Radar Chart)"): - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) + with gr.Accordion( + "What does zero-shot mean?", + open=False, + ): gr.Markdown( - "*We only display models that have been run on all task types in the benchmark*" + """ + A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. + The percentages in the table indicate what portion of the benchmark can be considered out-of-distribution for a given model. + 100% means the model has not been trained on any of the datasets in a given benchmark, and therefore the benchmark score can be interpreted as the model's overall generalization performance, + while 50% means the model has been finetuned on half of the tasks in the benchmark, thereby indicating that the benchmark results should be interpreted with a pinch of salt. + This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P” as these datasets are not based on title-body pairs. + Distilled, further fine-tunes, or in other ways, derivative models inherit the datasets of their parent models. + Based on community feedback and research findings, this definition may change in the future. Please open a PR if you notice any mistakes or want to help us refine annotations, see [GitHub](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91). + """ ) - with gr.Tab("Summary"): - summary_table.render() - download_summary = gr.DownloadButton("Download Table") - download_summary.click( - download_table, inputs=[summary_table], outputs=[download_summary] - ) - - with gr.Accordion( - "What do aggregate measures (Rank(Borda), Mean(Task), etc.) mean?", - open=False, - ): - gr.Markdown( + with gr.Accordion( + "What do the other columns mean?", + open=False, + ): + gr.Markdown( + """ + - **Number of Parameters**: This is the total number of parameters in the model including embedding parameters. A higher value means the model requires more CPU/GPU memory to run; thus, less is generally desirable. + - **Embedding Dimension**: This is the vector dimension of the embeddings that the model produces. When saving embeddings to disk, a higher dimension will require more space, thus less is usually desirable. + - **Max tokens**: This refers to how many tokens (=word pieces) the model can process. Generally, a larger value is desirable. + - **Zero-shot**: This indicates if the model is zero-shot on the benchmark. For more information on zero-shot see the info box above. """ - **Rank(borda)** is computed based on the [borda count](https://en.wikipedia.org/wiki/Borda_count), where each task is treated as a preference voter, which gives votes on the models per their relative performance on the task. The best model obtains the highest number of votes. The model with the highest number of votes across tasks obtains the highest rank. The Borda rank tends to prefer models that perform well broadly across tasks. However, given that it is a rank it can be unclear if the two models perform similarly. - - **Mean(Task)**: This is a naïve average computed across all the tasks within the benchmark. This score is simple to understand and is continuous as opposed to the Borda rank. However, the mean can overvalue tasks with higher variance in its scores. - - **Mean(TaskType)**: This is a weighted average across different task categories, such as classification or retrieval. It is computed by first computing the average by task category and then computing the average on each category. Similar to the Mean(Task) this measure is continuous and tends to overvalue tasks with higher variance. This score also prefers models that perform well across all task categories. - """ - ) - with gr.Accordion( - "What does zero-shot mean?", - open=False, - ): - gr.Markdown( + ) + with gr.Accordion( + "Why is a model missing or not showing up?", + open=False, + ): + gr.Markdown( + """ + Possible reasons why a model may not show up in the leaderboard: + + - **Filter Setting**: It is being filtered out with your current filter. By default, we do not show models that are not zero-shot on the benchmark. + You can change this setting in the model selection panel. + - **Missing Results**: The model may not have been run on the tasks in the benchmark. We only display models that have been run on at least one task + in the benchmark. For visualizations that require the mean across all tasks, we only display models that have been run on all tasks in the benchmark. + You can see existing results in the [results repository](https://github.com/embeddings-benchmark/results). This is also where new results are added via PR. + - **Missing Metadata**: Currently, we only show models for which we have metadata in [mteb](https://github.com/embeddings-benchmark/mteb). + You can follow this guide on how to add a [model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) and + see existing implementations [here](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/models). """ -A model is considered zero-shot if it is not trained on any splits of the datasets used to derive the tasks. -The percentages in the table indicate what portion of the benchmark can be considered out-of-distribution for a given model. -100% means the model has not been trained on any of the datasets in a given benchmark, and therefore the benchmark score can be interpreted as the model's overall generalization performance, -while 50% means the model has been finetuned on half of the tasks in the benchmark, thereby indicating that the benchmark results should be interpreted with a pinch of salt. -This definition creates a few edge cases. For instance, multiple models are typically trained on Wikipedia title and body pairs, but we do not define this as leakage on, e.g., “WikipediaRetrievalMultilingual” and “WikiClusteringP2P” as these datasets are not based on title-body pairs. -Distilled, further fine-tunes, or in other ways, derivative models inherit the datasets of their parent models. -Based on community feedback and research findings, this definition may change in the future. Please open a PR if you notice any mistakes or want to help us refine annotations, see [GitHub](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91). - """ + ) + with gr.Tab("Performance per task"): + per_task_table.render() + download_per_task = gr.DownloadButton("Download Table") + download_per_task.click( + download_table, inputs=[per_task_table], outputs=[download_per_task] ) - with gr.Accordion( - "What do the other columns mean?", - open=False, - ): - gr.Markdown( - """ -- **Number of Parameters**: This is the total number of parameters in the model including embedding parameters. A higher value means the model requires more CPU/GPU memory to run; thus, less is generally desirable. -- **Embedding Dimension**: This is the vector dimension of the embeddings that the model produces. When saving embeddings to disk, a higher dimension will require more space, thus less is usually desirable. -- **Max tokens**: This refers to how many tokens (=word pieces) the model can process. Generally, a larger value is desirable. -- **Zero-shot**: This indicates if the model is zero-shot on the benchmark. For more information on zero-shot see the info box above. - """ + with gr.Tab("Task information"): + task_info_table = gr.DataFrame(update_task_info, inputs=[task_select]) # noqa: F841 + + # This sets the benchmark from the URL query parameters + demo.load(set_benchmark_on_load, inputs=[], outputs=[benchmark_select]) + + @cachetools.cached( + cache={}, + key=lambda benchmark_name: hash(benchmark_name), + ) + def on_benchmark_select(benchmark_name): + start_time = time.time() + benchmark = mteb.get_benchmark(benchmark_name) + languages = [task.languages for task in benchmark.tasks if task.languages] + languages = set(itertools.chain.from_iterable(languages)) + languages = sorted(languages) + domains = [ + task.metadata.domains + for task in benchmark.tasks + if task.metadata.domains + ] + domains = set(itertools.chain.from_iterable(domains)) + types = { + task.metadata.type for task in benchmark.tasks if task.metadata.type + } + modalities = set() + for task in benchmark.tasks: + modalities.update(task.metadata.modalities) + languages, domains, types, modalities = ( + sorted(languages), + sorted(domains), + sorted(types), + sorted(modalities), ) - with gr.Accordion( - "Why is a model missing or not showing up?", - open=False, - ): - gr.Markdown( - """ -Possible reasons why a model may not show up in the leaderboard: - -- **Filter Setting**: It is being filtered out with your current filter. By default, we do not show models that are not zero-shot on the benchmark. -You can change this setting in the model selection panel. -- **Missing Results**: The model may not have been run on the tasks in the benchmark. We only display models that have been run on at least one task -in the benchmark. For visualizations that require the mean across all tasks, we only display models that have been run on all tasks in the benchmark. -You can see existing results in the [results repository](https://github.com/embeddings-benchmark/results). This is also where new results are added via PR. -- **Missing Metadata**: Currently, we only show models for which we have metadata in [mteb](https://github.com/embeddings-benchmark/mteb). -You can follow this guide on how to add a [model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) and -see existing implementations [here](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/models). - """ + elapsed = time.time() - start_time + benchmark_results = all_benchmark_results[benchmark_name] + scores = benchmark_results.get_scores(format="long") + logger.info(f"on_benchmark_select callback: {elapsed}s") + return ( + languages, + domains, + types, + modalities, + sorted([task.metadata.name for task in benchmark.tasks]), + scores, ) - with gr.Tab("Performance per task"): - per_task_table.render() - download_per_task = gr.DownloadButton("Download Table") - download_per_task.click( - download_table, inputs=[per_task_table], outputs=[download_per_task] - ) - with gr.Tab("Task information"): - task_info_table = gr.DataFrame(update_task_info, inputs=[task_select]) - # This sets the benchmark from the URL query parameters - demo.load(set_benchmark_on_load, inputs=[], outputs=[benchmark_select]) + benchmark_select.change( + on_benchmark_select, + inputs=[benchmark_select], + outputs=[ + lang_select, + domain_select, + type_select, + modality_select, + task_select, + scores, + ], + ) - @cachetools.cached( - cache={}, - key=lambda benchmark_name: hash(benchmark_name), - ) - def on_benchmark_select(benchmark_name): - start_time = time.time() - benchmark = mteb.get_benchmark(benchmark_name) - languages = [task.languages for task in benchmark.tasks if task.languages] - languages = set(itertools.chain.from_iterable(languages)) - languages = sorted(languages) - domains = [ - task.metadata.domains for task in benchmark.tasks if task.metadata.domains - ] - domains = set(itertools.chain.from_iterable(domains)) - types = {task.metadata.type for task in benchmark.tasks if task.metadata.type} - modalities = set() - for task in benchmark.tasks: - modalities.update(task.metadata.modalities) - languages, domains, types, modalities = ( - sorted(languages), - sorted(domains), - sorted(types), - sorted(modalities), + @cachetools.cached( + cache={}, + key=lambda benchmark_name, languages: hash( + (hash(benchmark_name), hash(tuple(languages))) + ), ) - elapsed = time.time() - start_time - benchmark_results = all_benchmark_results[benchmark_name] - scores = benchmark_results.get_scores(format="long") - logger.info(f"on_benchmark_select callback: {elapsed}s") - return ( - languages, - domains, - types, - modalities, - sorted([task.metadata.name for task in benchmark.tasks]), - scores, + def update_scores_on_lang_change(benchmark_name, languages): + start_time = time.time() + benchmark_results = all_benchmark_results[benchmark_name] + scores = benchmark_results.get_scores(languages=languages, format="long") + elapsed = time.time() - start_time + logger.info(f"update_scores callback: {elapsed}s") + return scores + + lang_select.input( + update_scores_on_lang_change, + inputs=[benchmark_select, lang_select], + outputs=[scores], ) - benchmark_select.change( - on_benchmark_select, - inputs=[benchmark_select], - outputs=[ - lang_select, - domain_select, - type_select, - modality_select, - task_select, - scores, - ], - ) - - @cachetools.cached( - cache={}, - key=lambda benchmark_name, languages: hash( - (hash(benchmark_name), hash(tuple(languages))) - ), - ) - def update_scores_on_lang_change(benchmark_name, languages): - start_time = time.time() - benchmark_results = all_benchmark_results[benchmark_name] - scores = benchmark_results.get_scores(languages=languages, format="long") - elapsed = time.time() - start_time - logger.info(f"update_scores callback: {elapsed}s") - return scores - - lang_select.input( - update_scores_on_lang_change, - inputs=[benchmark_select, lang_select], - outputs=[scores], - ) - - @cachetools.cached( - cache={}, - key=lambda benchmark_name, - type_select, - domain_select, - lang_select, - modality_select: hash( - ( - hash(benchmark_name), - hash(tuple(type_select)), - hash(tuple(domain_select)), - hash(tuple(lang_select)), - hash(tuple(modality_select)), - ) - ), - ) - def update_task_list( - benchmark_name, type_select, domain_select, lang_select, modality_select - ): - start_time = time.time() - tasks_to_keep = [] - for task in mteb.get_benchmark(benchmark_name).tasks: - if task.metadata.type not in type_select: - continue - if not (set(task.metadata.domains or []) & set(domain_select)): - continue - if not (set(task.languages or []) & set(lang_select)): - continue - if not (set(task.metadata.modalities or []) & set(modality_select)): - continue - tasks_to_keep.append(task.metadata.name) - elapsed = time.time() - start_time - logger.info(f"update_task_list callback: {elapsed}s") - return sorted(tasks_to_keep) - - type_select.input( - update_task_list, - inputs=[ - benchmark_select, - type_select, - domain_select, - lang_select, - modality_select, - ], - outputs=[task_select], - ) - domain_select.input( - update_task_list, - inputs=[ - benchmark_select, - type_select, - domain_select, - lang_select, - modality_select, - ], - outputs=[task_select], - ) - lang_select.input( - update_task_list, - inputs=[ - benchmark_select, - type_select, - domain_select, - lang_select, - modality_select, - ], - outputs=[task_select], - ) - modality_select.input( - update_task_list, - inputs=[ - benchmark_select, + @cachetools.cached( + cache={}, + key=lambda benchmark_name, type_select, domain_select, lang_select, - modality_select, - ], - outputs=[task_select], - ) + modality_select: hash( + ( + hash(benchmark_name), + hash(tuple(type_select)), + hash(tuple(domain_select)), + hash(tuple(lang_select)), + hash(tuple(modality_select)), + ) + ), + ) + def update_task_list( + benchmark_name, type_select, domain_select, lang_select, modality_select + ): + start_time = time.time() + tasks_to_keep = [] + for task in mteb.get_benchmark(benchmark_name).tasks: + if task.metadata.type not in type_select: + continue + if not (set(task.metadata.domains or []) & set(domain_select)): + continue + if not (set(task.languages or []) & set(lang_select)): + continue + if not (set(task.metadata.modalities or []) & set(modality_select)): + continue + tasks_to_keep.append(task.metadata.name) + elapsed = time.time() - start_time + logger.info(f"update_task_list callback: {elapsed}s") + return sorted(tasks_to_keep) + + type_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) + domain_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) + lang_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) + modality_select.input( + update_task_list, + inputs=[ + benchmark_select, + type_select, + domain_select, + lang_select, + modality_select, + ], + outputs=[task_select], + ) - @cachetools.cached( - cache={}, - key=lambda scores, - tasks, - availability, - compatibility, - instructions, - model_size, - zero_shot: hash( - ( - id(scores), - hash(tuple(tasks)), - hash(availability), - hash(tuple(compatibility)), - hash(instructions), - hash(model_size), - hash(zero_shot), - ) - ), - ) - def update_models( - scores: list[dict], - tasks: list[str], - availability: bool | None, - compatibility: list[str], - instructions: bool | None, - model_size: tuple[int, int], - zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"], - ): - start_time = time.time() - model_names = list({entry["model_name"] for entry in scores}) - filtered_models = filter_models( - model_names, + @cachetools.cached( + cache={}, + key=lambda scores, tasks, availability, compatibility, instructions, model_size, - zero_shot_setting=zero_shot, + zero_shot: hash( + ( + id(scores), + hash(tuple(tasks)), + hash(availability), + hash(tuple(compatibility)), + hash(instructions), + hash(model_size), + hash(zero_shot), + ) + ), ) - elapsed = time.time() - start_time - if model_names == filtered_models: - # This indicates that the models should not be filtered - return None - logger.info(f"update_models callback: {elapsed}s") - return sorted(filtered_models) - - scores.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - task_select.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - availability.input( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - compatibility.input( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - instructions.input( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - model_size.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - zero_shot.change( - update_models, - inputs=[ - scores, - task_select, - availability, - compatibility, - instructions, - model_size, - zero_shot, - ], - outputs=[models], - ) - - @cachetools.cached( - cache={}, - key=lambda scores, search_query, tasks, models_to_keep, benchmark_name: hash( - ( - id(scores), - hash(search_query), - hash(tuple(tasks)), - id(models_to_keep), - hash(benchmark_name), + def update_models( + scores: list[dict], + tasks: list[str], + availability: bool | None, + compatibility: list[str], + instructions: bool | None, + model_size: tuple[int, int], + zero_shot: Literal["allow_all", "remove_unknown", "only_zero_shot"], + ): + start_time = time.time() + model_names = list({entry["model_name"] for entry in scores}) + filtered_models = filter_models( + model_names, + tasks, + availability, + compatibility, + instructions, + model_size, + zero_shot_setting=zero_shot, ) - ), - ) - def update_tables( - scores, - search_query: str, - tasks, - models_to_keep, - benchmark_name: str, - ): - start_time = time.time() - tasks = set(tasks) - benchmark = mteb.get_benchmark(benchmark_name) - benchmark_tasks = {task.metadata.name for task in benchmark.tasks} - if (benchmark_tasks != tasks) or (models_to_keep is not None): - filtered_scores = [] - for entry in scores: - if entry["task_name"] not in tasks: - continue - if (models_to_keep is not None) and ( - entry["model_name"] not in models_to_keep - ): - continue - filtered_scores.append(entry) - else: - filtered_scores = scores - summary, per_task = create_tables(filtered_scores, search_query) - elapsed = time.time() - start_time - logger.info(f"update_tables callback: {elapsed}s") - return summary, per_task - - task_select.change( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - scores.change( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - models.change( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - searchbar.submit( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) + elapsed = time.time() - start_time + if model_names == filtered_models: + # This indicates that the models should not be filtered + return None + logger.info(f"update_models callback: {elapsed}s") + return sorted(filtered_models) + + scores.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + task_select.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + availability.input( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + compatibility.input( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + instructions.input( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + model_size.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) + zero_shot.change( + update_models, + inputs=[ + scores, + task_select, + availability, + compatibility, + instructions, + model_size, + zero_shot, + ], + outputs=[models], + ) - gr.Markdown(acknowledgment_md, elem_id="ack_markdown") - - -# Prerun on all benchmarks, so that results of callbacks get cached -for benchmark in benchmarks: - ( - bench_languages, - bench_domains, - bench_types, - bench_modalities, - bench_tasks, - bench_scores, - ) = on_benchmark_select(benchmark.name) - filtered_models = update_models( - bench_scores, - bench_tasks, - availability=None, - compatibility=[], - instructions=None, - model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - zero_shot="allow_all", - ) - # We have to call this both on the filtered and unfiltered task because the callbacks - # also gets called twice for some reason - update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) - filtered_tasks = update_task_list( - benchmark.name, bench_types, bench_domains, bench_languages, bench_modalities - ) - update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + @cachetools.cached( + cache={}, + key=lambda scores, + search_query, + tasks, + models_to_keep, + benchmark_name: hash( + ( + id(scores), + hash(search_query), + hash(tuple(tasks)), + id(models_to_keep), + hash(benchmark_name), + ) + ), + ) + def update_tables( + scores, + search_query: str, + tasks, + models_to_keep, + benchmark_name: str, + ): + start_time = time.time() + tasks = set(tasks) + benchmark = mteb.get_benchmark(benchmark_name) + benchmark_tasks = {task.metadata.name for task in benchmark.tasks} + if (benchmark_tasks != tasks) or (models_to_keep is not None): + filtered_scores = [] + for entry in scores: + if entry["task_name"] not in tasks: + continue + if (models_to_keep is not None) and ( + entry["model_name"] not in models_to_keep + ): + continue + filtered_scores.append(entry) + else: + filtered_scores = scores + summary, per_task = create_tables(filtered_scores, search_query) + elapsed = time.time() - start_time + logger.info(f"update_tables callback: {elapsed}s") + return summary, per_task + + task_select.change( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + scores.change( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + models.change( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + searchbar.submit( + update_tables, + inputs=[scores, searchbar, task_select, models, benchmark_select], + outputs=[summary_table, per_task_table], + ) + + gr.Markdown(acknowledgment_md, elem_id="ack_markdown") + + # Prerun on all benchmarks, so that results of callbacks get cached + for benchmark in benchmarks: + ( + bench_languages, + bench_domains, + bench_types, + bench_modalities, + bench_tasks, + bench_scores, + ) = on_benchmark_select(benchmark.name) + filtered_models = update_models( + bench_scores, + bench_tasks, + availability=None, + compatibility=[], + instructions=None, + model_size=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + zero_shot="allow_all", + ) + # We have to call this both on the filtered and unfiltered task because the callbacks + # also gets called twice for some reason + update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) + filtered_tasks = update_task_list( + benchmark.name, + bench_types, + bench_domains, + bench_languages, + bench_modalities, + ) + update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + return demo if __name__ == "__main__": - demo.launch(share=True) + logging.getLogger("mteb.load_results.task_results").setLevel( + logging.ERROR + ) # Warnings related to task split + logging.getLogger("mteb.model_meta").setLevel( + logging.ERROR + ) # Warning related to model metadata (fetch_from_hf=False) + logging.getLogger("mteb.load_results.benchmark_results").setLevel( + logging.ERROR + ) # Warning related to model metadata (fetch_from_hf=False) + warnings.filterwarnings("ignore", message="Couldn't get scores for .* due to .*") + + app = get_leaderboard_app() + app.launch(server_name="0.0.0.0", server_port=7860) diff --git a/mteb/models/cache_wrapper.py b/mteb/models/cache_wrapper.py index 61abccb9da..4fde7c4f49 100644 --- a/mteb/models/cache_wrapper.py +++ b/mteb/models/cache_wrapper.py @@ -12,9 +12,6 @@ from mteb.encoder_interface import Encoder from mteb.models.wrapper import Wrapper -logging.basicConfig( - level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" -) logger = logging.getLogger(__name__) diff --git a/mteb/models/gme_v_models.py b/mteb/models/gme_v_models.py index 19f6e4714a..8d83b54a33 100644 --- a/mteb/models/gme_v_models.py +++ b/mteb/models/gme_v_models.py @@ -16,12 +16,8 @@ from mteb.model_meta import ModelMeta from mteb.models.wrapper import Wrapper -logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) -HF_GME_QWEN2VL_2B = "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct" -HF_GME_QWEN2VL_7B = "Alibaba-NLP/gme-Qwen2-VL-7B-Instruct" - class Encoder(torch.nn.Module): def __init__( @@ -133,7 +129,7 @@ def embed( class GmeQwen2VL(Wrapper): def __init__( self, - model_name: str = HF_GME_QWEN2VL_2B, + model_name: str, model_path: str | None = None, device: str = "cuda" if torch.cuda.is_available() else "cpu", min_image_tokens=4, @@ -413,9 +409,9 @@ def fetch_image( gme_qwen2vl_2b = ModelMeta( loader=partial( GmeQwen2VL, - model_name=HF_GME_QWEN2VL_2B, + model_name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", ), - name=HF_GME_QWEN2VL_2B, + name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", languages=["eng_Latn", "cmn-Hans"], open_weights=True, revision="ce765ae71b8cdb208203cd8fb64a170b1b84293a", @@ -426,7 +422,7 @@ def fetch_image( embed_dim=1536, license="apache-2.0", max_tokens=32768, - reference="https://huggingface.co/" + HF_GME_QWEN2VL_2B, + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct", similarity_fn_name="cosine", framework=["PyTorch"], use_instructions=True, @@ -438,9 +434,9 @@ def fetch_image( gme_qwen2vl_7b = ModelMeta( loader=partial( GmeQwen2VL, - model_name=HF_GME_QWEN2VL_7B, + model_name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", ), - name=HF_GME_QWEN2VL_7B, + name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", languages=["eng_Latn", "cmn-Hans"], open_weights=True, revision="477027a6480f8630363be77751f169cc3434b673", @@ -451,7 +447,7 @@ def fetch_image( embed_dim=3584, license="apache-2.0", max_tokens=32768, - reference="https://huggingface.co/" + HF_GME_QWEN2VL_2B, + reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct", similarity_fn_name="cosine", framework=["PyTorch"], use_instructions=True, diff --git a/mteb/models/vlm2vec_models.py b/mteb/models/vlm2vec_models.py index a630a57d2f..70cc51cd28 100644 --- a/mteb/models/vlm2vec_models.py +++ b/mteb/models/vlm2vec_models.py @@ -18,7 +18,6 @@ suggest_package, ) -logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) EncodeTypes = Literal["query", "passage"]