diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 07aad5be5b57..787aa2592aa9 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -632,6 +632,8 @@ title: Jamba - local: model_doc/jetmoe title: JetMoe + - local: model_doc/jina_embeddings_v3 + title: jina_embeddings_v3 - local: model_doc/led title: LED - local: model_doc/lfm2 diff --git a/docs/source/en/model_doc/jina_embeddings_v3.md b/docs/source/en/model_doc/jina_embeddings_v3.md new file mode 100644 index 000000000000..e30af7e50ea9 --- /dev/null +++ b/docs/source/en/model_doc/jina_embeddings_v3.md @@ -0,0 +1,165 @@ + + +*This model was released on 2024-09-16 and added to Hugging Face Transformers on 2026-03-18.* + +
+
+ PyTorch + FlashAttention + SDPA +
+
+ + +# JinaEmbeddingsV3 + +The [Jina-Embeddings-v3](https://huggingface.co/papers/2409.10173) is a multilingual, multi-task text embedding model designed for a variety of NLP applications. Based on the XLM-RoBERTa architecture, this model uses **Rotary Position Embeddings (RoPE)** in place of absolute position embeddings to support long input sequences up to 8192 tokens. Additionally, it features 5 built-in **Task-Specific LoRA Adapters** that allow the model to generate task-specific embeddings (e.g., for retrieval vs. classification) without increasing inference latency significantly. + + +You can find the original Jina Embeddings v3 checkpoints under the [Jina AI](https://huggingface.co/jinaai) organization. + + +> [!TIP] +> Click on the Jina Embeddings v3 models in the right sidebar for more examples of how to apply the model to different language tasks. + +The example below demonstrates how to extract features (embeddings) with [`Pipeline`] and [`AutoModel`]. + + + + +```py +import torch +from transformers import pipeline + +pipeline = pipeline( + task="feature-extraction", + model="jinaai/jina-embeddings-v3-hf", +) +# Returns a list of lists containing the embeddings for each token +embeddings = pipeline("Jina Embeddings V3 is great for semantic search.") +``` + + + + + + +```py +import torch +from transformers import AutoModel, AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3-hf") +model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3-hf", device_map="auto") + +prompt = "Jina Embeddings V3 is great for semantic search." 
+inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + +with torch.no_grad(): + outputs = model(**inputs) + # The base AutoModel returns the raw hidden states for all tokens + last_hidden_states = outputs.last_hidden_state + +print(f"Features shape: {last_hidden_states.shape}") +``` + + + + +## Task-Specific LoRA Adapters + +A key feature of `JinaEmbeddingsV3` is its LoRA adapters, which allow you to tailor the output embeddings to specific use cases without the overhead of loading entirely different models. + +The following tasks are supported: + +* **`retrieval.query`**: Used for query embeddings in asymmetric retrieval tasks (e.g., search queries). +* **`retrieval.passage`**: Used for passage embeddings in asymmetric retrieval tasks (e.g., the documents being searched). +* **`separation`**: Used for embeddings in clustering and re-ranking applications. +* **`classification`**: Used for embeddings in classification tasks. +* **`text-matching`**: Used for embeddings in tasks that quantify similarity between two texts, such as Semantic Textual Similarity (STS) or symmetric retrieval tasks. + + +To generate high-quality sentence or paragraph embeddings, you need to apply **mean pooling** to the model's token embeddings. Mean pooling takes all token embeddings from the model's output and averages them, masking out the padding tokens. + +Here is how you can generate sentence embeddings tailored for a retrieval query task using the `AutoModel` API. 
+ +```python +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer, AutoModel + +def mean_pooling(model_output, attention_mask): + # First element of model_output contains all token embeddings + token_embeddings = model_output[0] + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + + # Sum the embeddings and divide by the number of non-padding tokens + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask + + +sentences = [ + "How is the weather today?", + "What is the current weather like today?" +] + +tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3-hf") +model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3-hf") + +encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt").to(model.device) + +# Set up the adapter mask for your specific task +task = 'retrieval_query' # Can be any of (retrieval_passage, separation, classification, text_matching) depending on the use-case. 
+ +model.load_adapter("jinaai/jina-embeddings-v3-hf", adapter_name=task, adapter_kwargs={"subfolder": task}) + +model.set_adapter(task) + +with torch.no_grad(): + model_output = model(**encoded_input) + +embeddings = mean_pooling(model_output, encoded_input["attention_mask"]) +embeddings = F.normalize(embeddings, p=2, dim=1) + +print(embeddings.shape) +# Output: torch.Size([2, 1024]) +``` + + +## JinaEmbeddingsV3Config + +[[autodoc]] JinaEmbeddingsV3Config + +## JinaEmbeddingsV3Model + +[[autodoc]] JinaEmbeddingsV3Model + - forward + +## JinaEmbeddingsV3ForMaskedLM + +[[autodoc]] JinaEmbeddingsV3ForMaskedLM + - forward + +## JinaEmbeddingsV3ForSequenceClassification + +[[autodoc]] JinaEmbeddingsV3ForSequenceClassification + - forward + +## JinaEmbeddingsV3ForTokenClassification + +[[autodoc]] JinaEmbeddingsV3ForTokenClassification + - forward + +## JinaEmbeddingsV3ForQuestionAnswering + +[[autodoc]] JinaEmbeddingsV3ForQuestionAnswering + - forward diff --git a/src/transformers/conversion_mapping.py b/src/transformers/conversion_mapping.py index 22d9ddcbcd03..c7452fc89d20 100755 --- a/src/transformers/conversion_mapping.py +++ b/src/transformers/conversion_mapping.py @@ -420,6 +420,22 @@ def _build_checkpoint_conversion_mapping(): target_patterns="LayerNorm.bias", ), ], + "jina_embeddings_v3": [ + WeightRenaming(source_patterns="emb_ln", target_patterns="embeddings.LayerNorm"), + WeightRenaming(source_patterns="encoder.layers", target_patterns="layers"), + WeightConverter( + source_patterns="mixer.Wqkv", + target_patterns=[ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + ], + operations=[Chunk(dim=0)], + ), + WeightRenaming(source_patterns="mixer.out_proj", target_patterns="self_attn.o_proj"), + WeightRenaming(source_patterns="norm1", target_patterns="post_attention_layernorm"), + WeightRenaming(source_patterns="norm2", target_patterns="post_mlp_layernorm"), + ], } mapping["legacy"] += [ WeightRenaming( diff --git 
a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 5f45081ac4a0..860a1bac23cf 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -203,6 +203,7 @@ from .jamba import * from .janus import * from .jetmoe import * + from .jina_embeddings_v3 import * from .kosmos2 import * from .kosmos2_5 import * from .kyutai_speech_to_text import * diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 476b5362343f..3893ca8838c9 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -237,6 +237,7 @@ ("jamba", "JambaConfig"), ("janus", "JanusConfig"), ("jetmoe", "JetMoeConfig"), + ("jina_embeddings_v3", "JinaEmbeddingsV3Config"), ("kosmos-2", "Kosmos2Config"), ("kosmos-2.5", "Kosmos2_5Config"), ("kyutai_speech_to_text", "KyutaiSpeechToTextConfig"), @@ -741,6 +742,7 @@ ("jamba", "Jamba"), ("janus", "Janus"), ("jetmoe", "JetMoe"), + ("jina_embeddings_v3", "JinaEmbeddingsV3"), ("kosmos-2", "KOSMOS-2"), ("kosmos-2.5", "KOSMOS-2.5"), ("kyutai_speech_to_text", "KyutaiSpeechToText"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 764d3b770e86..afb0658a456c 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -234,6 +234,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("jamba", "JambaModel"), ("janus", "JanusModel"), ("jetmoe", "JetMoeModel"), + ("jina_embeddings_v3", "JinaEmbeddingsV3Model"), ("kosmos-2", "Kosmos2Model"), ("kosmos-2.5", "Kosmos2_5Model"), ("kyutai_speech_to_text", "KyutaiSpeechToTextModel"), @@ -1049,6 +1050,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("fnet", "FNetForMaskedLM"), ("funnel", "FunnelForMaskedLM"), ("ibert", "IBertForMaskedLM"), + ("jina_embeddings_v3", "JinaEmbeddingsV3ForMaskedLM"), 
("layoutlm", "LayoutLMForMaskedLM"), ("longformer", "LongformerForMaskedLM"), ("luke", "LukeForMaskedLM"), @@ -1232,6 +1234,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("ibert", "IBertForSequenceClassification"), ("jamba", "JambaForSequenceClassification"), ("jetmoe", "JetMoeForSequenceClassification"), + ("jina_embeddings_v3", "JinaEmbeddingsV3ForSequenceClassification"), ("layoutlm", "LayoutLMForSequenceClassification"), ("layoutlmv2", "LayoutLMv2ForSequenceClassification"), ("layoutlmv3", "LayoutLMv3ForSequenceClassification"), @@ -1331,6 +1334,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("gpt_neox", "GPTNeoXForQuestionAnswering"), ("gptj", "GPTJForQuestionAnswering"), ("ibert", "IBertForQuestionAnswering"), + ("jina_embeddings_v3", "JinaEmbeddingsV3ForQuestionAnswering"), ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), ("layoutlmv3", "LayoutLMv3ForQuestionAnswering"), ("led", "LEDForQuestionAnswering"), @@ -1447,6 +1451,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("gpt_oss", "GptOssForTokenClassification"), ("helium", "HeliumForTokenClassification"), ("ibert", "IBertForTokenClassification"), + ("jina_embeddings_v3", "JinaEmbeddingsV3ForTokenClassification"), ("layoutlm", "LayoutLMForTokenClassification"), ("layoutlmv2", "LayoutLMv2ForTokenClassification"), ("layoutlmv3", "LayoutLMv3ForTokenClassification"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 4faaa9844315..46076b7f223f 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -161,6 +161,7 @@ ("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None), ("internvl", "Qwen2Tokenizer" if is_tokenizers_available() else None), ("jais2", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("jina_embeddings_v3", "XLMRobertaTokenizer" if is_tokenizers_available() else 
None), ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None), ("lasr_ctc", "LasrTokenizer" if is_tokenizers_available() else None), ("lasr_encoder", "LasrTokenizer" if is_tokenizers_available() else None), diff --git a/src/transformers/models/jina_embeddings_v3/__init__.py b/src/transformers/models/jina_embeddings_v3/__init__.py new file mode 100644 index 000000000000..c0c33e9ff015 --- /dev/null +++ b/src/transformers/models/jina_embeddings_v3/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2026 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. + +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_jina_embeddings_v3 import * + from .modeling_jina_embeddings_v3 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py b/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py new file mode 100644 index 000000000000..fea8d7adbf02 --- /dev/null +++ b/src/transformers/models/jina_embeddings_v3/configuration_jina_embeddings_v3.py @@ -0,0 +1,72 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/jina_embeddings_v3/modular_jina_embeddings_v3.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_jina_embeddings_v3.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2026 The Jina-AI and HuggingFace Inc. teams. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from huggingface_hub.dataclasses import strict + +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters +from ...utils import auto_docstring + + +@auto_docstring(checkpoint="jinaai/jina-embeddings-v3-hf") +@strict(accept_kwargs=True) +class JinaEmbeddingsV3Config(PreTrainedConfig): + r""" + Examples: + + ```python + >>> from transformers import JinaEmbeddingsV3Config, JinaEmbeddingsV3Model + + >>> # Initializing a Jina-Embeddings-V3 jinaai/jina-embeddings-v3-hf style configuration + >>> configuration = JinaEmbeddingsV3Config() + + >>> # Initializing a model (with random weights) from the jinaai/jina-embeddings-v3-hf style configuration + >>> model = JinaEmbeddingsV3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "jina_embeddings_v3" + + vocab_size: int = 250002 + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + max_position_embeddings: int = 8194 + type_vocab_size: int = 1 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-5 + pad_token_id: int | None = 1 + bos_token_id: int | None = 0 + eos_token_id: int | None = 2 + use_cache: bool = True + classifier_dropout: float | int | None = None + tie_word_embeddings: bool = True + default_theta = 20000.0 + rope_parameters: RopeParameters | dict | None = None + + +__all__ = ["JinaEmbeddingsV3Config"] diff --git a/src/transformers/models/jina_embeddings_v3/modeling_jina_embeddings_v3.py b/src/transformers/models/jina_embeddings_v3/modeling_jina_embeddings_v3.py new file mode 100644 index 000000000000..547fb54fa245 --- /dev/null +++ b/src/transformers/models/jina_embeddings_v3/modeling_jina_embeddings_v3.py @@ -0,0 +1,822 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically 
generated from src/transformers/models/jina_embeddings_v3/modular_jina_embeddings_v3.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_jina_embeddings_v3.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# Copyright 2026 The Jina-AI and HuggingFace Inc. teams. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable +from typing import Optional + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ... 
import initialization as init +from ...activations import ACT2FN, gelu +from ...integrations import use_kernel_func_from_hub, use_kernelized_func +from ...masking_utils import create_bidirectional_mask +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import ( + BaseModelOutputWithPooling, + MaskedLMOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring +from ...utils.generic import can_return_tuple, maybe_autocast, merge_with_config_defaults +from ...utils.output_capturing import capture_outputs +from .configuration_jina_embeddings_v3 import JinaEmbeddingsV3Config + + +class JinaEmbeddingsV3Embeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: JinaEmbeddingsV3Config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: torch.LongTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + ) -> 
torch.Tensor: + embeddings = inputs_embeds + if inputs_embeds is None: + embeddings = self.word_embeddings(input_ids) + + input_shape = embeddings.shape[:-1] + device = embeddings.device + + if position_ids is None: + position_ids = self.position_ids[:, : input_shape[1]] + + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) + buffered_token_type_ids = self.token_type_ids.expand(input_shape[0], -1) + buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) + token_type_ids = buffered_token_type_ids + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class JinaEmbeddingsV3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: JinaEmbeddingsV3Config, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False) + + @staticmethod + def compute_default_rope_parameters( + config: JinaEmbeddingsV3Config | None = None, + device: Optional["torch.device"] = None, + seq_len: int | None = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse 
frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with maybe_autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +@use_kernel_func_from_hub("rotary_pos_emb") +def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. 
+ Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: torch.Tensor | None, + scaling: float | None = None, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +@use_kernelized_func(apply_rotary_pos_emb) +class JinaEmbeddingsV3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: JinaEmbeddingsV3Config): + super().__init__() + self.config = config + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_probs_dropout_prob + self.is_causal = False + + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) + self.o_proj = 
nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( + self.config._attn_implementation, eager_attention_forward + ) + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class JinaEmbeddingsV3MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class JinaEmbeddingsV3Layer(GradientCheckpointingLayer): + def __init__(self, config: JinaEmbeddingsV3Config): + super().__init__() + self.post_attention_layernorm = 
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.post_attention_dropout = nn.Dropout(config.hidden_dropout_prob) + self.post_mlp_dropout = nn.Dropout(config.hidden_dropout_prob) + self.mlp = JinaEmbeddingsV3MLP(config) + self.self_attn = JinaEmbeddingsV3Attention(config=config) + self.post_mlp_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.FloatTensor: + residual = hidden_states + attention_output, _ = self.self_attn( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + self.post_attention_dropout(attention_output) + hidden_states = self.post_attention_layernorm(hidden_states) + + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.post_mlp_dropout(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) + return hidden_states + + +class JinaEmbeddingsV3Pooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@auto_docstring +class JinaEmbeddingsV3PreTrainedModel(PreTrainedModel): + config_class = JinaEmbeddingsV3Config + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": JinaEmbeddingsV3Layer, + "attentions": JinaEmbeddingsV3Attention, + } + + @torch.no_grad() + def _init_weights(self, module): + """Initialize the weights""" + super()._init_weights(module) + if isinstance(module, JinaEmbeddingsV3LMHead): + init.zeros_(module.bias) + elif isinstance(module, JinaEmbeddingsV3Embeddings): + init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1))) + init.zeros_(module.token_type_ids) + + +@auto_docstring +class JinaEmbeddingsV3Model(JinaEmbeddingsV3PreTrainedModel): + _no_split_modules = ["JinaEmbeddingsV3Embeddings", "JinaEmbeddingsV3Layer"] + + def __init__(self, config: JinaEmbeddingsV3Config, add_pooling_layer=True): + r""" + add_pooling_layer (bool, *optional*, defaults to `True`): + Whether to add a pooling layer + """ + super().__init__(config) + self.config = config + self.gradient_checkpointing = False + + self.embeddings = JinaEmbeddingsV3Embeddings(config) + + self.pooler = JinaEmbeddingsV3Pooler(config) if add_pooling_layer else None + self.rotary_emb = JinaEmbeddingsV3RotaryEmbedding(config) + self.layers = nn.ModuleList([JinaEmbeddingsV3Layer(config) for _ in range(config.num_hidden_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @merge_with_config_defaults + @capture_outputs + 
@auto_docstring + def forward( + self, + input_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + token_type_ids: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling | tuple: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + hidden_states = embedding_output + + if position_ids is None: + # Default RoPE positions assume right padding; left padding requires explicit corrected position_ids for RoPE. + position_ids = torch.arange(hidden_states.shape[1], dtype=torch.long, device=hidden_states.device) + position_ids = position_ids.unsqueeze(0) + + position_embeddings = self.rotary_emb(embedding_output, position_ids) + + attention_mask = create_bidirectional_mask( + config=self.config, + inputs_embeds=embedding_output, + attention_mask=attention_mask, + ) + + for encoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = encoder_layer( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + **kwargs, + ) + + sequence_output = hidden_states + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + +class JinaEmbeddingsV3LMHead(nn.Module): + """JinaEmbeddingsV3 Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + 
self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + +@auto_docstring +class JinaEmbeddingsV3ForMaskedLM(JinaEmbeddingsV3PreTrainedModel): + _tied_weights_keys = { + "lm_head.decoder.weight": "roberta.embeddings.word_embeddings.weight", + "lm_head.decoder.bias": "lm_head.bias", + } + + def __init__(self, config): + super().__init__(config=config) + + self.lm_head = JinaEmbeddingsV3LMHead(config) + self.roberta = JinaEmbeddingsV3Model(config, add_pooling_layer=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.FloatTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor] | MaskedLMOutput: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. 
+ + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=True, + **kwargs, + ) + sequence_output = outputs[0] + + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + # move labels to correct device + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class JinaEmbeddingsV3ClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@auto_docstring( + custom_intro=""" + XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """ +) +class JinaEmbeddingsV3ForSequenceClassification(JinaEmbeddingsV3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.classifier = JinaEmbeddingsV3ClassificationHead(config) + + self.roberta = JinaEmbeddingsV3Model(config, add_pooling_layer=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.FloatTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor] | SequenceClassifierOutput: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=True, + **kwargs, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class JinaEmbeddingsV3ForTokenClassification(JinaEmbeddingsV3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.roberta = 
JinaEmbeddingsV3Model(config, add_pooling_layer=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.FloatTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor] | TokenClassifierOutput: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. 
+ """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=True, + **kwargs, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@auto_docstring +class JinaEmbeddingsV3ForQuestionAnswering(JinaEmbeddingsV3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.roberta = JinaEmbeddingsV3Model(config, add_pooling_layer=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.FloatTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + start_positions: torch.LongTensor | None = None, + end_positions: torch.LongTensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor] | QuestionAnsweringModelOutput: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. 
+ This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=True, + **kwargs, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "JinaEmbeddingsV3PreTrainedModel", + "JinaEmbeddingsV3Model", + "JinaEmbeddingsV3ForMaskedLM", + "JinaEmbeddingsV3ForSequenceClassification", + "JinaEmbeddingsV3ForTokenClassification", + "JinaEmbeddingsV3ForQuestionAnswering", + "JinaEmbeddingsV3Layer", +] diff --git a/src/transformers/models/jina_embeddings_v3/modular_jina_embeddings_v3.py 
b/src/transformers/models/jina_embeddings_v3/modular_jina_embeddings_v3.py new file mode 100644 index 000000000000..bb38c96b687e --- /dev/null +++ b/src/transformers/models/jina_embeddings_v3/modular_jina_embeddings_v3.py @@ -0,0 +1,404 @@ +# Copyright 2026 The Jina-AI and HuggingFace Inc. teams. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Callable + +import torch +from huggingface_hub.dataclasses import strict +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...integrations import use_kernelized_func +from ...masking_utils import create_bidirectional_mask +from ...modeling_outputs import ( + BaseModelOutputWithPooling, + MaskedLMOutput, +) +from ...modeling_rope_utils import RopeParameters +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, logging +from ...utils.generic import can_return_tuple, merge_with_config_defaults +from ...utils.output_capturing import capture_outputs +from ..clip.modeling_clip import CLIPMLP +from ..gpt_neox.modeling_gpt_neox import GPTNeoXLayer +from ..llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding, apply_rotary_pos_emb +from ..xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig +from ..xlm_roberta.modeling_xlm_roberta import ( + XLMRobertaEmbeddings, + XLMRobertaForMaskedLM, + XLMRobertaForQuestionAnswering, + 
XLMRobertaForSequenceClassification, + XLMRobertaForTokenClassification, + XLMRobertaLMHead, + XLMRobertaModel, + XLMRobertaPooler, + XLMRobertaPreTrainedModel, + eager_attention_forward, +) + + +logger = logging.get_logger(__name__) + + +@auto_docstring(checkpoint="jinaai/jina-embeddings-v3-hf") +@strict(accept_kwargs=True) +class JinaEmbeddingsV3Config(XLMRobertaConfig): + r""" + Examples: + + ```python + >>> from transformers import JinaEmbeddingsV3Config, JinaEmbeddingsV3Model + + >>> # Initializing a Jina-Embeddings-V3 jinaai/jina-embeddings-v3-hf style configuration + >>> configuration = JinaEmbeddingsV3Config() + + >>> # Initializing a model (with random weights) from the jinaai/jina-embeddings-v3-hf style configuration + >>> model = JinaEmbeddingsV3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "jina_embeddings_v3" + default_theta = 20000.0 + + vocab_size: int = 250002 + hidden_size: int = 1024 + num_hidden_layers: int = 24 + num_attention_heads: int = 16 + intermediate_size: int = 4096 + max_position_embeddings: int = 8194 + type_vocab_size: int = 1 + layer_norm_eps: float = 1e-5 + rope_parameters: RopeParameters | dict | None = None + + add_cross_attention = AttributeError() + is_decoder = AttributeError() + + +class JinaEmbeddingsV3Embeddings(XLMRobertaEmbeddings): + def __init__(self, config: JinaEmbeddingsV3Config): + super().__init__(config) + + del self.padding_idx + del self.position_embeddings + + def create_position_ids_from_inputs_embeds(): + raise AttributeError("Not needed for JinaEmbeddingsV3") + + def create_position_ids_from_input_ids(): + raise AttributeError("Not needed for JinaEmbeddingsV3") + + def forward( + self, + input_ids: torch.LongTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + ) -> torch.Tensor: + embeddings = inputs_embeds + if 
inputs_embeds is None: + embeddings = self.word_embeddings(input_ids) + + input_shape = embeddings.shape[:-1] + device = embeddings.device + + if position_ids is None: + position_ids = self.position_ids[:, : input_shape[1]] + + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) + buffered_token_type_ids = self.token_type_ids.expand(input_shape[0], -1) + buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) + token_type_ids = buffered_token_type_ids + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class JinaEmbeddingsV3RotaryEmbedding(LlamaRotaryEmbedding): + pass + + +@use_kernelized_func(apply_rotary_pos_emb) +class JinaEmbeddingsV3Attention(LlamaAttention): + def __init__(self, config: JinaEmbeddingsV3Config): + super().__init__(config) + self.is_causal = False + self.attention_dropout = config.attention_probs_dropout_prob + + self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) + self.k_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) + self.v_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) + self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True) + + del self.layer_idx + del self.num_key_value_groups + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor]: + input_shape = 
hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface( + self.config._attn_implementation, eager_attention_forward + ) + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class JinaEmbeddingsV3MLP(CLIPMLP): + pass + + +class JinaEmbeddingsV3Layer(GPTNeoXLayer): + def __init__(self, config: JinaEmbeddingsV3Config): + super().__init__(config) + self.self_attn = JinaEmbeddingsV3Attention(config=config) + + self.post_attention_dropout = nn.Dropout(config.hidden_dropout_prob) + self.post_mlp_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_mlp_dropout = nn.Dropout(config.hidden_dropout_prob) + + del self.use_parallel_residual + del self.input_layernorm + del self.attention + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + position_embeddings: tuple[torch.Tensor, torch.Tensor] | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> torch.FloatTensor: + residual = hidden_states + attention_output, _ = self.self_attn( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + self.post_attention_dropout(attention_output) + hidden_states = 
self.post_attention_layernorm(hidden_states) + + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.post_mlp_dropout(hidden_states) + hidden_states = self.post_mlp_layernorm(hidden_states) + return hidden_states + + +class JinaEmbeddingsV3Pooler(XLMRobertaPooler): + pass + + +class JinaEmbeddingsV3PreTrainedModel(XLMRobertaPreTrainedModel): + _can_record_outputs = { + "hidden_states": JinaEmbeddingsV3Layer, + "attentions": JinaEmbeddingsV3Attention, + } + + +@auto_docstring +class JinaEmbeddingsV3Model(XLMRobertaModel): + def __init__(self, config: JinaEmbeddingsV3Config, add_pooling_layer=True): + super().__init__(config) + self.rotary_emb = JinaEmbeddingsV3RotaryEmbedding(config) + self.layers = nn.ModuleList([JinaEmbeddingsV3Layer(config) for _ in range(config.num_hidden_layers)]) + del self.encoder + + # Initialize weights and apply final processing + self.post_init() + + @merge_with_config_defaults + @capture_outputs + @auto_docstring + def forward( + self, + input_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + token_type_ids: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> BaseModelOutputWithPooling | tuple: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + hidden_states = embedding_output + + if position_ids is None: + # Default RoPE positions assume right padding; left padding requires explicit corrected position_ids for RoPE. 
+ position_ids = torch.arange(hidden_states.shape[1], dtype=torch.long, device=hidden_states.device) + position_ids = position_ids.unsqueeze(0) + + position_embeddings = self.rotary_emb(embedding_output, position_ids) + + attention_mask = create_bidirectional_mask( + config=self.config, + inputs_embeds=embedding_output, + attention_mask=attention_mask, + ) + + for encoder_layer in self.layers[: self.config.num_hidden_layers]: + hidden_states = encoder_layer( + hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + **kwargs, + ) + + sequence_output = hidden_states + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + ) + + def _create_attention_masks(self): + raise AttributeError("Not needed for JinaEmbeddingsV3") + + +class JinaEmbeddingsV3LMHead(XLMRobertaLMHead): + pass + + +class JinaEmbeddingsV3ForMaskedLM(XLMRobertaForMaskedLM): + def __init__(self, config): + JinaEmbeddingsV3PreTrainedModel.__init__(self, config=config) + + self.lm_head = JinaEmbeddingsV3LMHead(config) + self.roberta = JinaEmbeddingsV3Model(config, add_pooling_layer=False) + + # Initialize weights and apply final processing + self.post_init() + + @can_return_tuple + @auto_docstring + def forward( + self, + input_ids: torch.LongTensor | None = None, + attention_mask: torch.FloatTensor | None = None, + token_type_ids: torch.LongTensor | None = None, + position_ids: torch.LongTensor | None = None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + **kwargs: Unpack[TransformersKwargs], + ) -> tuple[torch.Tensor] | MaskedLMOutput: + r""" + token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. 
Indices are selected in `[0,1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + This parameter can only be used when the model is initialized with `type_vocab_size` parameter with value + >= 2. All the value in this tensor should be always < type_vocab_size. + + [What are token type IDs?](../glossary#token-type-ids) + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + return_dict=True, + **kwargs, + ) + sequence_output = outputs[0] + + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + # move labels to correct device + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class JinaEmbeddingsV3ForSequenceClassification(XLMRobertaForSequenceClassification): + pass + + +class JinaEmbeddingsV3ForTokenClassification(XLMRobertaForTokenClassification): + pass + + +class JinaEmbeddingsV3ForQuestionAnswering(XLMRobertaForQuestionAnswering): + pass + + +__all__ = [ + "JinaEmbeddingsV3Config", + "JinaEmbeddingsV3PreTrainedModel", + "JinaEmbeddingsV3Model", + "JinaEmbeddingsV3ForMaskedLM", + "JinaEmbeddingsV3ForSequenceClassification", + "JinaEmbeddingsV3ForTokenClassification", + 
"JinaEmbeddingsV3ForQuestionAnswering", + "JinaEmbeddingsV3Layer", +] diff --git a/tests/models/jina_embeddings_v3/__init__.py b/tests/models/jina_embeddings_v3/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/jina_embeddings_v3/test_modeling_jina_embeddings_v3.py b/tests/models/jina_embeddings_v3/test_modeling_jina_embeddings_v3.py new file mode 100644 index 000000000000..a2395613ba4b --- /dev/null +++ b/tests/models/jina_embeddings_v3/test_modeling_jina_embeddings_v3.py @@ -0,0 +1,432 @@ +# Copyright 2026 the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import unittest

from transformers import AutoModel, AutoTokenizer, is_torch_available
from transformers.models.jina_embeddings_v3 import JinaEmbeddingsV3Config
from transformers.testing_utils import (
    cleanup,
    require_torch,
    slow,
    torch_device,
)

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
from ...test_pipeline_mixin import PipelineTesterMixin


if is_torch_available():
    import torch

    from transformers import (
        JinaEmbeddingsV3ForMaskedLM,
        JinaEmbeddingsV3ForQuestionAnswering,
        JinaEmbeddingsV3ForSequenceClassification,
        JinaEmbeddingsV3ForTokenClassification,
        JinaEmbeddingsV3Model,
    )


class JinaEmbeddingsV3ModelTester:
    """Builds tiny configs and random inputs for the JinaEmbeddingsV3 unit tests."""

    def __init__(
        self,
        parent,
        batch_size=13,
        seq_length=7,
        is_training=True,
        use_input_mask=True,
        use_token_type_ids=True,
        use_labels=True,
        vocab_size=32,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=20,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=8,
        type_vocab_size=1,
        type_sequence_label_size=2,
        initializer_range=0.02,
        num_labels=3,
        scope=None,
    ):
        self.parent = parent
        # Input shape and feature toggles.
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.is_training = is_training
        self.use_input_mask = use_input_mask
        self.use_token_type_ids = use_token_type_ids
        self.use_labels = use_labels
        # Tiny-model architecture hyperparameters.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.num_labels = num_labels
        self.scope = scope

    def prepare_config_and_inputs(self):
        """Return a config plus random ids/mask/labels matching the tester's dimensions."""
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = (
            random_attention_mask([self.batch_size, self.seq_length]) if self.use_input_mask else None
        )
        token_type_ids = (
            ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
            if self.use_token_type_ids
            else None
        )

        sequence_labels = None
        token_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)

        config = self.get_config()

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels

    def get_config(self):
        return JinaEmbeddingsV3Config(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

    def create_and_check_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels):
        model = JinaEmbeddingsV3Model(config=config)
        model.to(torch_device)
        model.eval()
        result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

        result = model(input_ids, token_type_ids=token_type_ids)
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
(self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels + ): + model = JinaEmbeddingsV3ForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels + ): + model = JinaEmbeddingsV3ForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels + ): + config.num_labels = self.num_labels + model = JinaEmbeddingsV3ForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + 
token_labels, + ): + config.num_labels = self.num_labels + model = JinaEmbeddingsV3ForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + ) = config_and_inputs + + inputs_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": input_mask, + } + return config, inputs_dict + + +@require_torch +class JinaEmbeddingsV3ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = ( + ( + JinaEmbeddingsV3Model, + JinaEmbeddingsV3ForMaskedLM, + JinaEmbeddingsV3ForQuestionAnswering, + JinaEmbeddingsV3ForSequenceClassification, + JinaEmbeddingsV3ForTokenClassification, + ) + if is_torch_available() + else () + ) + pipeline_model_mapping = ( + { + "feature-extraction": JinaEmbeddingsV3Model, + "fill-mask": JinaEmbeddingsV3ForMaskedLM, + "text-classification": JinaEmbeddingsV3ForSequenceClassification, + "token-classification": JinaEmbeddingsV3ForTokenClassification, + "zero-shot": JinaEmbeddingsV3ForSequenceClassification, + } + if is_torch_available() + else {} + ) + + def setUp(self): + self.model_tester = JinaEmbeddingsV3ModelTester(self) + self.config_tester = ConfigTester(self, config_class=JinaEmbeddingsV3Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + 
self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + +@require_torch +class JinaEmbeddingsV3ModelIntegrationTest(unittest.TestCase): + model_id = "jinaai/jina-embeddings-v3-hf" + prompt = "Jina Embeddings V3 is great for semantic search." + + def setup(self): + cleanup(torch_device, gc_collect=True) + + def tearDown(self): + cleanup(torch_device, gc_collect=True) + + def _prepare_inputs(self): + tokenizer = AutoTokenizer.from_pretrained(self.model_id) + inputs = tokenizer(self.prompt, return_tensors="pt", padding=True) + return inputs + + @slow + def test_inference_no_head_absolute_embedding(self): + model = AutoModel.from_pretrained(self.model_id, dtype=torch.float32) + model.eval() + inputs = self._prepare_inputs() + + with torch.no_grad(): + output = model(**inputs)[0] + + expected_shape = torch.Size((1, 17, 1024)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [ + [-3.1011, 0.8560, -0.2491, 0.9427, 1.4015, -1.1527, 1.3804, -0.5453, -1.8164], + [-3.1108, 1.0107, -0.2097, 1.3495, 0.9984, -0.9518, 1.3189, -0.6295, -2.1128], + [-2.7095, 0.6469, -0.4475, 1.1364, 1.5975, -0.7545, 1.0803, 0.5199, -2.3569], + ] + ] + ) + + torch.testing.assert_close(output[:, 1:4, 1:10], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_retrieval_query_adapter(self): + task = "retrieval_query" + model = 
AutoModel.from_pretrained(self.model_id, dtype=torch.float32) + model.load_adapter(self.model_id, adapter_name=task, adapter_kwargs={"subfolder": task}) + model.set_adapter(task) + model.eval() + inputs = self._prepare_inputs() + + with torch.no_grad(): + output = model(**inputs)[0] + + self.assertEqual(output.shape, torch.Size((1, 17, 1024))) + expected_slice = torch.tensor( + [ + [ + [-1.9765, 0.7356, -0.4414, 0.5823, 2.1507, -0.8906, 0.0233, -0.2389, -1.5708], + [-2.0078, 0.9562, -0.3315, 1.0080, 1.8247, -0.6678, -0.2505, -0.3441, -1.9328], + [-1.9107, 0.7120, -0.4675, 0.9436, 2.1607, -0.4170, -0.1513, 1.0063, -2.0103], + ] + ] + ) + + torch.testing.assert_close(output[:, 1:4, 1:10], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_retrieval_passage_adapter(self): + task = "retrieval_passage" + model = AutoModel.from_pretrained(self.model_id, dtype=torch.float32) + model.load_adapter(self.model_id, adapter_name=task, adapter_kwargs={"subfolder": task}) + model.set_adapter(task) + model.eval() + inputs = self._prepare_inputs() + + with torch.no_grad(): + output = model(**inputs)[0] + + expected_shape = torch.Size((1, 17, 1024)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor( + [ + [ + [-1.7028, 0.5688, -0.8541, 0.4696, 2.5396, -0.8374, -0.1404, -0.3123, -1.4636], + [-1.6631, 0.6571, -0.8641, 0.9177, 2.3502, -0.6578, -0.3763, -0.3975, -1.7684], + [-1.4739, 0.4739, -0.8745, 0.8812, 2.6848, -0.4496, -0.4964, 0.6403, -2.0821], + ] + ] + ) + + torch.testing.assert_close(output[:, 1:4, 1:10], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_separation_adapter(self): + task = "separation" + model = AutoModel.from_pretrained(self.model_id, dtype=torch.float32) + model.load_adapter(self.model_id, adapter_name=task, adapter_kwargs={"subfolder": task}) + model.set_adapter(task) + model.eval() + + inputs = self._prepare_inputs() + + with torch.no_grad(): + output = model(**inputs)[0] + + 
self.assertEqual(output.shape, torch.Size((1, 17, 1024))) + expected_slice = torch.tensor( + [ + [ + [-3.0336, 1.4392, 0.2875, 0.7660, 0.7054, -1.1701, 1.6121, -0.6325, -1.5177], + [-3.0875, 1.5134, 0.3620, 1.0281, 0.4895, -1.0484, 1.6574, -0.7636, -1.6736], + [-2.7605, 1.2920, 0.2223, 0.9895, 0.8515, -0.9050, 1.5558, 0.1410, -1.8531], + ] + ] + ) + + torch.testing.assert_close(output[:, 1:4, 1:10], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_classification_adapter(self): + task = "classification" + model = AutoModel.from_pretrained(self.model_id, dtype=torch.float32) + model.load_adapter(self.model_id, adapter_name=task, adapter_kwargs={"subfolder": task}) + model.set_adapter(task) + model.eval() + + inputs = self._prepare_inputs() + + with torch.no_grad(): + output = model(**inputs)[0] + + self.assertEqual(output.shape, torch.Size((1, 17, 1024))) + expected_slice = torch.tensor( + [ + [ + [-2.7150, 0.2485, 1.2297, 0.6988, 0.9804, -1.2831, 1.3446, -0.1663, -0.6874], + [-2.8101, 0.1711, 1.2010, 0.9873, 0.5092, -1.3312, 1.4633, -0.2467, -0.7835], + [-2.6067, 0.2362, 0.6945, 1.0134, 0.7105, -1.3767, 0.9999, 0.4427, -1.1153], + ] + ] + ) + + torch.testing.assert_close(output[:, 1:4, 1:10], expected_slice, rtol=1e-4, atol=1e-4) + + @slow + def test_inference_text_matching_adapter(self): + task = "text_matching" + model = AutoModel.from_pretrained(self.model_id, dtype=torch.float32) + model.load_adapter(self.model_id, adapter_name=task, adapter_kwargs={"subfolder": task}) + model.set_adapter(task) + model.eval() + + inputs = self._prepare_inputs() + + with torch.no_grad(): + output = model(**inputs)[0] + + self.assertEqual(output.shape, torch.Size((1, 17, 1024))) + expected_slice = torch.tensor( + [ + [ + [-1.5888, 1.0527, 0.1237, -0.0822, 1.6507, -1.0371, -0.8815, -0.8082, -0.6564], + [-1.6529, 1.3143, 0.1957, 0.2914, 1.4897, -0.8735, -1.0067, -0.7544, -1.0513], + [-1.5308, 1.4805, -0.1393, 0.3879, 1.4373, -0.6064, -1.6436, 0.4793, -1.3388], + 
] + ] + ) + + torch.testing.assert_close(output[:, 1:4, 1:10], expected_slice, rtol=1e-4, atol=1e-4)