235 changes: 235 additions & 0 deletions mteb/models/bge_models.py
@@ -89,6 +89,215 @@
# "s2orc": [], # (title, abstract) (title, citation title) (abstract, citation abstract)
}

bgem3_training_data = {
# source https://arxiv.org/abs/2402.03216
"T2Retrieval": ["train"],
"DuReader": ["train"],
"MMarcoReranking": ["train"],
"CMedQAv2-reranking": ["train"],
"HotpotQA": ["train"],
"NQ": ["train"],
"MSMARCO": ["train"],
"MrTidyRetrieval": ["train"],
"MIRACLRetrieval": ["train"],
"CodeSearchNet": ["train"],
# not in mteb
# "s2orc"
# Wikipedia
# "xP3"
# "mC4"
# "CC-News"
# "MTP"
# "NLLB"
# "CCMatrix"
# TriviaQA
# COL-IEE
# PubMedQA
# SQuAD
# SimCSE
# mMARCO-ZH
# LawGPT
# NLI-zh2, LeCaRDv2,
# NLI, MultiLongDoc (their synthetic)
}

# https://huggingface.co/BAAI/bge-m3/discussions/29
bgem3_languages = [
"afr_Latn", # af
# als
Collaborator: I'm a bit unsure why these are commented out?

Member Author: I've taken these language codes from the discussion, but either I can't find them in the language mapping or I'm not sure which ones they correspond to.

Collaborator: Oooh okay, ChatGPT usually does a remarkable job at matching these. There is also a Python library that can do this for you; wait a sec, I'll find it.

Collaborator: @Samoed It's called ISO639 and it feels like magic.

Member Author: I think we don't have all languages in LANG_MAPPING:

LANG_MAPPING = {

Member Author: Hm. Seems that LANG_MAPPING is only used in the MTEB class. I think this should be removed in v2.

Collaborator: Hmm, yeah, interesting.

Member Author: I can't find an easy way to get the script from the language, so I'll leave it as is for now.

Collaborator: Hmm, again, I think LLMs can be a good friend in doing that. If you have the name of the language, you're probably also a Google search away from the solution. And most languages use Latin, Arabic, or Cyrillic script anyway, so there are some sensible defaults to go with.
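
For illustration, here is a minimal sketch of the two-letter-to-"xxx_Script" mapping discussed in this thread. It assumes the pycountry package (the ISO639 library mentioned above may expose a different API), and the default-script table is a hand-picked guess, not something from this PR:

import pycountry

# Hand-picked defaults for illustration only; most languages in this list use
# Latin, Arabic, or Cyrillic script, so a small override table plus a Latin
# fallback covers a lot of ground.
DEFAULT_SCRIPT = {"be": "Cyrl", "bg": "Cyrl", "ru": "Cyrl", "uk": "Cyrl", "ar": "Arab"}

def to_mteb_code(two_letter: str) -> str:
    # Resolve the two-letter code to its ISO 639-3 form; raises LookupError if unknown.
    lang = pycountry.languages.lookup(two_letter)
    return f"{lang.alpha_3}_{DEFAULT_SCRIPT.get(two_letter, 'Latn')}"

print(to_mteb_code("es"))  # spa_Latn
print(to_mteb_code("be"))  # bel_Cyrl

Codes that pycountry cannot resolve would still need the manual treatment used in the list below.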

"amh_Ethi", # am
# an
# ar
"azj_Latn", # arz
# as
"ast_Latn", # ast
# av
# az
"azj_Latn", # azb
# ba
# bar
# bcl
"ben_Beng", # be
"bul_Cyrl", # bg
# bh
# bn
# bo
"bel_Cyrl", # bpy
# br
# bs
# bxr
"cat_Latn", # ca
# cbk
# ce
"ceb_Latn", # ceb
"ckb_Arab", # ckb
# co
# cs
# cv
# cy
"dan_Latn", # da
"deu_Latn", # de
# diq
# dsb
# dty
# dv
"ell_Grek", # el
# eml
"eng_Latn", # en
# eo
"est_Latn", # es
# et
# eu
# fa
"fin_Latn", # fi
"fra_Latn", # fr
# fy
# ga
# gd
"glg_Latn", # gl
# gn
# gom
"guj_Gujr", # gu
# gv
"heb_Hebr", # he
"hin_Deva", # hi
# hif
# hr
# hsb
# ht
# hu
# hy
# ia
# id
# ie
# ilo
# io
# is
"ita_Latn", # it
"jpn_Jpan", # ja
# jbo
# jv
# ka
# kk
# km
# kn
"kor_Hang", # ko
# krc
# ku
# kv
# kw
# ky
# la
# lb
# lez
# li
# lmo
# lo
# lt
# lv
# mai
# mg
# mhr
# min
# mk
# ml
# mn
# mr
# mrj
# ms
# mt
# mwl
# my
# myv
# mzn
# nah
# nap
# nds
# ne
# new
# nl
# nn
# no
# oc
# or
# os
# pa
# pam
# pfl
# pl
# pms
# pnb
# ps
# pt
# qu
# rm
# ro
"rus_Cyrl", # ru
# sa
# sah
# sc
# scn
# sco
# sd
# sh
# si
# sk
# sl
# so
# sq
# sr
# su
# sv
# sw
# ta
# te
# tg
"tha_Thai", # th
# tk
# tl
# tr
# tt
# tyv
# ug
"ukr_Cyrl", # uk
# ur
# uz
# vec
# vep
# vi
# vls
# vo
# wa
# war
# wuu
# xal
# xmf
# yi
# yo
# yue
"zho_Hans", # zh
]

bge_small_en_v1_5 = ModelMeta(
loader=partial( # type: ignore
sentence_transformers_loader,
@@ -329,6 +538,32 @@
training_datasets=bge_chinese_training_data,
)

bge_m3 = ModelMeta(
loader=partial( # type: ignore
sentence_transformers_loader,
model_name="BAAI/bge-m3",
revision="5617a9f61b028005a4858fdac845db406aefb181",
),
name="BAAI/bge-m3",
languages=bgem3_languages,
open_weights=True,
revision="5617a9f61b028005a4858fdac845db406aefb181",
release_date="2024-06-28",
n_parameters=568_000_000,
memory_usage=None,
embed_dim=1024,
license="mit",
max_tokens=8194,
reference="https://huggingface.co/BAAI/bge-m3",
similarity_fn_name="cosine",
framework=["Sentence Transformers", "PyTorch"],
use_instructions=False,
public_training_data=True,
public_training_code=None,
training_datasets=bgem3_training_data,
)
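
As a quick sanity check, a minimal usage sketch of how this new entry could be exercised, assuming mteb's standard get_model / get_tasks / MTEB.run flow; the task name is an arbitrary example, not something this PR adds:

import mteb

# Resolves the ModelMeta registered above and calls its sentence_transformers loader.
model = mteb.get_model("BAAI/bge-m3")
tasks = mteb.get_tasks(tasks=["NFCorpus"])  # arbitrary retrieval task for illustration
evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="results/bge-m3")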


bge_multilingual_gemma2 = ModelMeta(
loader=partial( # type: ignore
sentence_transformers_loader,