diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index e8d61d4e0b..37b654ac92 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -48,6 +48,8 @@ class Benchmark: citation: str | None = None contacts: list[str] | None = None display_on_leaderboard: bool = True + icon: str | None = None + display_name: str | None = None def __iter__(self): return iter(self.tasks) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index fd11c6d828..aa766c23b8 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -28,6 +28,8 @@ MTEB_EN = Benchmark( name="MTEB(eng, v2)", + display_name="English", + icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -97,6 +99,8 @@ MTEB_ENG_CLASSIC = Benchmark( name="MTEB(eng, v1)", + display_name="English Legacy", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -189,6 +193,8 @@ MTEB_MAIN_RU = Benchmark( name="MTEB(rus, v1)", + display_name="Russian", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg", tasks=get_tasks( languages=["rus"], tasks=[ @@ -239,6 +245,7 @@ MTEB_RETRIEVAL_WITH_INSTRUCTIONS = Benchmark( name="FollowIR", + display_name="Instruction Following", tasks=get_tasks( tasks=[ "Robust04InstructionRetrieval", @@ -260,6 +267,8 @@ MTEB_RETRIEVAL_LAW = Benchmark( name="MTEB(Law, v1)", # This benchmark is likely in the need of an update + display_name="Legal", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", tasks=get_tasks( tasks=[ "AILACasedocs", @@ -279,6 +288,8 @@ MTEB_RETRIEVAL_MEDICAL = Benchmark( name="MTEB(Medical, v1)", + display_name="Medical", + 
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", tasks=get_tasks( tasks=[ "CUREv1", @@ -328,6 +339,8 @@ SEB = Benchmark( name="MTEB(Scandinavian, v1)", + display_name="Scandinavian", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg", tasks=get_tasks( tasks=[ # Bitext @@ -379,6 +392,7 @@ CoIR = Benchmark( name="CoIR", + display_name="Code Information Retrieval", tasks=get_tasks( tasks=[ "AppsRetrieval", @@ -408,6 +422,7 @@ RAR_b = Benchmark( name="RAR-b", + display_name="Reasoning retrieval", tasks=get_tasks( tasks=[ "ARCChallenge", @@ -442,6 +457,8 @@ MTEB_FRA = Benchmark( name="MTEB(fra, v1)", + display_name="French", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", tasks=MTEBTasks( get_tasks( languages=["fra"], @@ -496,6 +513,8 @@ MTEB_DEU = Benchmark( name="MTEB(deu, v1)", + display_name="German", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", tasks=get_tasks( languages=["deu"], exclusive_language_filter=True, @@ -543,6 +562,8 @@ MTEB_KOR = Benchmark( name="MTEB(kor, v1)", + display_name="Korean", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg", tasks=get_tasks( languages=["kor"], tasks=[ # @KennethEnevoldsen: We could probably expand this to a more solid benchamrk, but for now I have left it as is. 
@@ -565,6 +586,8 @@ MTEB_POL = Benchmark( name="MTEB(pol, v1)", + display_name="Polish", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg", tasks=MTEBTasks( get_tasks( languages=["pol"], @@ -610,6 +633,8 @@ MTEB_code = Benchmark( name="MTEB(Code, v1)", + display_name="Code", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", tasks=get_tasks( tasks=[ # Retrieval @@ -649,6 +674,8 @@ MTEB_multilingual = Benchmark( name="MTEB(Multilingual, v1)", + display_name="Multilingual", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -793,6 +820,8 @@ MTEB_JPN = Benchmark( name="MTEB(jpn, v1)", + display_name="Japanese", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", tasks=get_tasks( languages=["jpn"], tasks=[ @@ -861,6 +890,8 @@ MTEB_INDIC = Benchmark( name="MTEB(Indic, v1)", + display_name="Indic", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -952,6 +983,8 @@ MTEB_EU = Benchmark( name="MTEB(Europe, v1)", + display_name="European", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg", tasks=get_tasks( tasks=[ "BornholmBitextMining", @@ -1040,6 +1073,7 @@ LONG_EMBED = Benchmark( name="LongEmbed", + display_name="Long-context Retrieval", tasks=get_tasks( tasks=[ "LEMBNarrativeQARetrieval", @@ -1130,7 +1164,6 @@ primaryClass={cs.SE}, url={https://arxiv.org/abs/2406.14497}, }""", - display_on_leaderboard=False, ) BEIR = Benchmark( @@ -1191,6 +1224,8 @@ C_MTEB = Benchmark( name="MTEB(cmn, v1)", + display_name="Chinese", + 
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg", tasks=MTEBTasks( get_tasks( tasks=[ @@ -1253,6 +1288,8 @@ FA_MTEB = Benchmark( name="MTEB(fas, beta)", + display_name="Farsi (BETA)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg", tasks=get_tasks( languages=["fas"], tasks=[ @@ -1333,6 +1370,8 @@ CHEMTEB = Benchmark( name="ChemTEB", + display_name="Chemical", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg", tasks=get_tasks( tasks=[ "PubChemSMILESBitextMining", @@ -1546,6 +1585,8 @@ MIEB_ENG = Benchmark( name="MIEB(eng)", + display_name="Images, English", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg", tasks=get_tasks( tasks=MIEB_common_tasks + [ @@ -1571,6 +1612,8 @@ MIEB_MULTILINGUAL = Benchmark( name="MIEB(Multilingual)", + display_name="Images, Multilingual", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg", tasks=get_tasks( tasks=MIEB_common_tasks + [ @@ -1602,6 +1645,8 @@ MIEB_LITE = Benchmark( name="MIEB(lite)", + display_name="Images, Lite", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg", tasks=get_tasks( tasks=[ # Image Classification diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 5506d40596..c8dd632b66 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -2,6 +2,9 @@ import gradio as gr +import mteb +from mteb import Benchmark + """ Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks. 
@@ -17,245 +20,175 @@ ] """ BENCHMARK_ENTRIES = [ - ( - "Multilingual", - dict( - value="MTEB(Multilingual, v1)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg", - ), - ), - ( - "English", - dict( - value="MTEB(eng, v2)", - icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", - ), - ), + mteb.get_benchmarks(["MTEB(Multilingual, v1)", "MTEB(eng, v2)"]), ( "Image Benchmarks", - [ - ( - "Images, Multilingual", - dict( - value="MIEB(Multilingual)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg", - ), - ), - ( - "Images, English", - dict( - value="MIEB(eng)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg", - ), - ), - ( - "Images, Lite", - dict( - value="MIEB(lite)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg", - ), - ), - ], + mteb.get_benchmarks( + [ + "MIEB(Multilingual)", + "MIEB(eng)", + "MIEB(lite)", + ] + ), ), ( "Domain-Specific Benchmarks", - [ - ( - "Code", - dict( - value="MTEB(Code, v1)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", - ), - ), - ( - "Legal", - dict( - value="MTEB(Law, v1)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", - ), - ), - ( - "Medical", - dict( - value="MTEB(Medical, v1)", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", - ), - ), - ( - "Chemical", - dict( - value="ChemTEB", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg", - ), - ), -
], + mteb.get_benchmarks( + [ + "MTEB(Code, v1)", + "MTEB(Law, v1)", + "MTEB(Medical, v1)", + "ChemTEB", + ] + ), ), ( "Regional Benchmarks", - [ - ( - "European", - dict( - value="MTEB(Europe, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg", - ), - ), - ( - "Indic", - dict( - value="MTEB(Indic, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg", - ), - ), - ( - "Scandinavian", - dict( - value="MTEB(Scandinavian, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg", - ), - ), - ], + mteb.get_benchmarks( + [ + "MTEB(Europe, v1)", + "MTEB(Indic, v1)", + "MTEB(Scandinavian, v1)", + ] + ), ), ( "Language-specific Benchmarks", - [ - ( - "Chinese", - dict( - value="MTEB(cmn, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg", - ), - ), - ( - "German", - dict( - value="MTEB(deu, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", - ), - ), - ( - "French", - dict( - value="MTEB(fra, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", - ), - ), - ( - "Japanese", - dict( - value="MTEB(jpn, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", - ), - ), - ( - "Korean", - dict( - value="MTEB(kor, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg", - ), - ), - ( - "Polish", - dict( - value="MTEB(pol, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg", - ), - ), - ( - "Russian", - dict( - value="MTEB(rus, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg", - ), - ), - ( -
"Farsi (BETA)", - dict( - value="MTEB(fas, beta)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg", - ), - ), - ], + mteb.get_benchmarks( + [ + "MTEB(cmn, v1)", + "MTEB(deu, v1)", + "MTEB(fra, v1)", + "MTEB(jpn, v1)", + "MTEB(kor, v1)", + "MTEB(pol, v1)", + "MTEB(rus, v1)", + "MTEB(fas, beta)", + ] + ), ), ( "Miscellaneous", - [ - ("BEIR", dict(value="BEIR", icon=None)), - ("BEIR-NL", dict(value="BEIR-NL", icon=None)), - ("BRIGHT", dict(value="BRIGHT", icon=None)), - ("BRIGHT (long)", dict(value="BRIGHT (long)", icon=None)), - ("BuiltBench (eng)", dict(value="BuiltBench(eng)", icon=None)), - ("Code Information Retrieval", dict(value="CoIR", icon=None)), - ("Instruction Following", dict(value="FollowIR", icon=None)), - ("Long-context Retrieval", dict(value="LongEmbed", icon=None)), - ("MINERSBitextMining", dict(value="MINERSBitextMining", icon=None)), - ("NanoBEIR", dict(value="NanoBEIR", icon=None)), - ("Reasoning retrieval", dict(value="RAR-b", icon=None)), - ], + mteb.get_benchmarks( + [ + "BEIR", + "BEIR-NL", + "NanoBEIR", + "BRIGHT", + "BRIGHT (long)", + "BuiltBench(eng)", + "CoIR", + "FollowIR", + "LongEmbed", + "MINERSBitextMining", + "RAR-b", + ] + ), ), ( "Legacy", - [ - ( - "English Legacy", - dict( - value="MTEB(eng, v1)", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg", - ), - ), - ], + mteb.get_benchmarks( + [ + "MTEB(eng, v1)", + ] + ), ), ] -def _create_button(i, label, entry, state, label_to_value, **kwargs): - val = entry["value"] - label_to_value[label] = val +def _create_button( + i: int, + benchmark: Benchmark, + state: gr.State, + label_to_value: dict[str, str], + **kwargs, +): + """Create the selector button for one benchmark and wire it to ``state``. + + The button label is ``benchmark.display_name``, or ``benchmark.name`` when + no display name is set; ``label_to_value`` is updated in place to map that + label to the benchmark name. Clicking the button writes the benchmark name + to ``state``; whenever ``state`` changes the button is re-rendered with the + "primary" variant if it is the selected one and "secondary" otherwise. + """ + val = benchmark.name + label = ( + benchmark.display_name if benchmark.display_name is not None else benchmark.name + ) + label_to_value[label] = benchmark.name button = gr.Button( label, variant="secondary" if i != 0 else "primary", -
icon=entry["icon"], + icon=benchmark.icon, key=f"{i}_button_{val}", elem_classes="text-white", **kwargs, ) - def _update_variant(state, label) -> gr.Button: + def _update_variant(state: str, label: str) -> gr.Button: if state == label_to_value[label]: return gr.Button(variant="primary") else: return gr.Button(variant="secondary") - def _update_value(label) -> str: + def _update_value(label: str) -> str: return label_to_value[label] state.change(_update_variant, inputs=[state, button], outputs=[button]) - button.click(_update_value, outputs=[state], inputs=[button]) + button.click(_update_value, inputs=[button], outputs=[state]) return button -def make_selector(entries: list[tuple[str, dict | list]]) -> tuple[gr.State, gr.Column]: +def make_selector( + entries: list[list[Benchmark] | tuple[str, list[Benchmark]]], +) -> tuple[gr.State, gr.Column]: + """Build the benchmark selector column. + + Each entry is either a list of benchmarks, rendered as large top-level + buttons, or a ``(label, benchmarks)`` tuple, rendered as a Markdown heading + followed by medium-sized buttons. Returns the state holding the name of the + selected benchmark (initially the first benchmark of the first entry) and + the column containing all buttons. + + Raises ``ValueError`` if ``entries`` is empty or the first entry is neither + a list nor a tuple. + """ if not entries: raise ValueError("No entries were specified, can't build selector.") label_to_value = {} state = None with gr.Column() as column: - for i, (label, entry) in enumerate(entries): + i = 0 + for entry in entries: if i == 0: - if isinstance(entry, dict): - state = gr.State(entry["value"]) + if isinstance(entry, list): + first_entry = entry[0] + state = gr.State(first_entry.name) + elif isinstance(entry, tuple): + _label, _entry = entry + state = gr.State(_entry[0].name) else: - _label, _entry = entry[0] - state = gr.State(_entry["value"]) - if isinstance(entry, dict): - button = _create_button( - i, label, entry, state, label_to_value, size="lg" - ) - else: + raise ValueError("Benchmark selector specified incorrectly") + if isinstance(entry, list): + for benchmark in entry: + button = _create_button( + i, benchmark, state, label_to_value, size="lg" + ) + i += 1 + elif isinstance(entry, tuple): + label, _entry = entry gr.Markdown(f"### **{label}**") - for sub_label, sub_entry in entry: + for benchmark in _entry: button = _create_button( # noqa: F841 - i, sub_label, sub_entry, state, label_to_value, size="md" + i,
benchmark, state, label_to_value, size="md" ) + i += 1 return state, column + + +if __name__ == "__main__": + with gr.Blocks() as b: + selector = make_selector(BENCHMARK_ENTRIES) + + b.launch()