154 changes: 154 additions & 0 deletions mteb/models/jina_models.py
@@ -0,0 +1,154 @@
from __future__ import annotations

from collections.abc import Sequence
from functools import partial
from typing import Any

import numpy as np

from mteb.model_meta import ModelMeta
from mteb.models.wrapper import Wrapper  # assumed import path for the base Wrapper used below

MODEL_NAME = "jinaai/jina-embeddings-v3"
REVISION = "fa78e35d523dcda8d3b5212c7487cf70a4b277da"
XLMR_LANGUAGES = [
"afr_Latn",
"amh_Latn",
"ara_Latn",
"asm_Latn",
"aze_Latn",
"bel_Latn",
"bul_Latn",
"ben_Latn",
"ben_Beng",
"bre_Latn",
"bos_Latn",
"cat_Latn",
"ces_Latn",
"cym_Latn",
"dan_Latn",
"deu_Latn",
"ell_Latn",
"eng_Latn",
"epo_Latn",
"spa_Latn",
"est_Latn",
"eus_Latn",
"fas_Latn",
"fin_Latn",
"fra_Latn",
"fry_Latn",
"gle_Latn",
"gla_Latn",
"glg_Latn",
"guj_Latn",
"hau_Latn",
"heb_Latn",
"hin_Latn",
"hin_Deva",
"hrv_Latn",
"hun_Latn",
"hye_Latn",
"ind_Latn",
"isl_Latn",
"ita_Latn",
"jpn_Latn",
"jav_Latn",
"kat_Latn",
"kaz_Latn",
"khm_Latn",
"kan_Latn",
"kor_Latn",
"kur_Latn",
"kir_Latn",
"lat_Latn",
"lao_Latn",
"lit_Latn",
"lav_Latn",
"mlg_Latn",
"mkd_Latn",
"mal_Latn",
"mon_Latn",
"mar_Latn",
"msa_Latn",
"mya_Latn",
"nep_Latn",
"nld_Latn",
"nob_Latn",
"orm_Latn",
"ori_Latn",
"pan_Latn",
"pol_Latn",
"pus_Latn",
"por_Latn",
"ron_Latn",
"rus_Latn",
"san_Latn",
"snd_Latn",
"sin_Latn",
"slk_Latn",
"slv_Latn",
"som_Latn",
"sqi_Latn",
"srp_Latn",
"sun_Latn",
"swe_Latn",
"swa_Latn",
"tam_Latn",
"tam_Taml",
"tel_Latn",
"tel_Telu",
"tha_Latn",
"tgl_Latn",
"tur_Latn",
"uig_Latn",
"ukr_Latn",
"urd_Latn",
"urd_Arab",
"uzb_Latn",
"vie_Latn",
"xho_Latn",
"yid_Latn",
"zho_Hant",
"zho_Hans",
]


model_prompts = {
"retrieval.query": "Represent the query for retrieving evidence documents: ",
"retrieval.passage": "Represent the document for retrieval: ",
"separation": "",
"classification": "",
"text-matching": "",
}

# LoRA adaptation for specific downstream tasks.
# An empty string represents the no-LoRA weights (i.e., the checkpoint after pair tuning).
supported_tasks = list(model_prompts.keys()) + [""]


def jina_embeddings_v3_loader(**kwargs):
    class JinaV3Wrapper(Wrapper):
        def encode(
            self,
            sentences: Sequence[str],
            task: str,
            *args,
            **kwargs: Any,
        ) -> np.ndarray:
            # Forward the Jina task name both as `task` (LoRA adapter selection)
            # and as `prompt_name` (instruction prefix selection).
            return super().encode(
Member

I'm also working on this in #1319, and this implementation isn't quite right, because MTEB will generate tasks that don't align with the Jina tasks.
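For illustration only, here is a minimal sketch of the kind of translation layer this would need; the MTEB-side keys and the fallback value are assumptions, not what either PR actually implements:

# Hypothetical mapping from MTEB-generated prompt names to Jina task/LoRA names.
# Keys and the fallback are illustrative assumptions.
MTEB_PROMPT_TO_JINA_TASK = {
    "Retrieval-query": "retrieval.query",
    "Retrieval-passage": "retrieval.passage",
    "Clustering": "separation",
    "Classification": "classification",
    "STS": "text-matching",
}


def to_jina_task(mteb_prompt_name: str | None) -> str:
    """Translate an MTEB prompt name into a Jina task name, defaulting to text-matching."""
    return MTEB_PROMPT_TO_JINA_TASK.get(mteb_prompt_name or "", "text-matching")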

Author

Okay @Samoed, I wasn't aware you were working on it; I can imagine, since I'm still testing the implementation (marked as draft). What should we do now? Shall I take over the Jina models?

Author

By the way, the reason I am adding this here is that I want to run a full MMTEB evaluation of the Jina models.
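As a rough sketch of what such an evaluation run could look like (the task selection and output folder are placeholders, and the model name is assumed to resolve through the loader registered in this PR):

import mteb

model = mteb.get_model("jinaai/jina-embeddings-v3")
tasks = mteb.get_tasks(task_types=["Retrieval", "STS"], languages=["eng"])
evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="results/jina-embeddings-v3")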

Member

Yes, I wanted to as well. I will provide my test results in an hour.

                sentences, task=task, prompt_name=task, *args, **kwargs
            )

    return JinaV3Wrapper(**kwargs)


jina_embeddings_v3 = ModelMeta(
loader=partial(
jina_embeddings_v3_loader,
model_name=MODEL_NAME,
revision=REVISION,
model_prompts=model_prompts,
trust_remote_code=True,
),
name=MODEL_NAME,
languages=XLMR_LANGUAGES,
open_source=True, # CC-BY-NC-4.0
Contributor

In #1316 we updated the metadata for all models. It might be nice to add the following fields (a filled-in sketch follows the list):

    max_tokens=...,
    embed_dim=...,
    n_parameters=...,
    memory_usage=...,
    license="cc-by-nc-4.0",
    reference=..., # you will need to pull from the PR to add this
    similarity_fn_name="cosine",
    framework=[...],
    use_instructions=...,
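A filled-in sketch of what that could look like here; the numeric values are only illustrative and should be verified against the model card before merging:

    max_tokens=8192,  # verify against the model card
    embed_dim=1024,
    n_parameters=572_000_000,  # roughly 572M; verify
    memory_usage=None,
    license="cc-by-nc-4.0",
    reference="https://huggingface.co/jinaai/jina-embeddings-v3",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,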

Author

Clear.

revision=REVISION,
release_date="2024-09-18",
)
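For reference, a minimal sketch of how the task/prompt mechanism the wrapper forwards is typically exercised directly through sentence-transformers; the chosen task name is only an example, and the exact keyword handling depends on the model's remote code:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
embeddings = model.encode(
    ["How do I reset my password?"],
    task="retrieval.query",         # selects the LoRA adapter (assumed kwarg pass-through)
    prompt_name="retrieval.query",  # selects the matching instruction prefix
)
print(embeddings.shape)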
2 changes: 2 additions & 0 deletions mteb/models/overview.py
@@ -17,6 +17,7 @@
google_models,
gritlm_models,
gte_models,
jina_models,
llm2vec_models,
mxbai_models,
nomic_models,
@@ -38,6 +39,7 @@
google_models,
gritlm_models,
gte_models,
jina_models,
llm2vec_models,
mxbai_models,
nomic_models,