diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index eb1047e4..0d82b8a5 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -16,6 +16,13 @@ jobs: test-target: ["tests", "pylate"] steps: + - name: Cache Hugging Face Hub + id: cache-hf + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface + key: ${{ runner.os }}-hf + - name: Checkout code uses: actions/checkout@v3 diff --git a/docs/api/indexes/Voyager.md b/docs/api/indexes/Voyager.md index b2654aa0..e9e9ce61 100644 --- a/docs/api/indexes/Voyager.md +++ b/docs/api/indexes/Voyager.md @@ -91,32 +91,31 @@ Voyager index. The Voyager index is a fast and efficient index for approximate n **Parameters** - - **queries_embeddings** (*numpy.ndarray | torch.Tensor*) - - **k** (*int*) – defaults to `10` - + - **queries_embeddings** (*numpy.ndarray | torch.Tensor*) + - **k** (*int*) – defaults to `10` + ???- note "add_documents" Add documents to the index. **Parameters** - - **documents_ids** (*str | list[str]*) - - **documents_embeddings** (*list[numpy.ndarray | torch.Tensor]*) - - **batch_size** (*int*) – defaults to `2000` - + - **documents_ids** (*str | list[str]*) + - **documents_embeddings** (*list[numpy.ndarray | torch.Tensor]*) + - **batch_size** (*int*) – defaults to `2000` + ???- note "get_documents_embeddings" Retrieve document embeddings for re-ranking from Voyager. **Parameters** - - **document_ids** (*list[list[str]]*) - + - **document_ids** (*list[list[str]]*) + ???- note "remove_documents" Remove documents from the index. **Parameters** - - **documents_ids** (*list[str]*) - + - **documents_ids** (*list[str]*) diff --git a/docs/api/losses/Contrastive.md b/docs/api/losses/Contrastive.md index 9f838df4..678dfdfc 100644 --- a/docs/api/losses/Contrastive.md +++ b/docs/api/losses/Contrastive.md @@ -57,9 +57,9 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "add_module" Add a child module to the current module. @@ -68,9 +68,9 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "apply" Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. @@ -79,15 +79,15 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) - + - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) + ???- note "bfloat16" Casts all floating point parameters and buffers to ``bfloat16`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "buffers" Return an iterator over module buffers. @@ -96,15 +96,15 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "children" Return an iterator over immediate children modules. Yields: Module: a child module - + ???- note "compile" Compile this Module's forward using :func:`torch.compile`. @@ -113,16 +113,16 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. 
If th **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "cpu" Move all model parameters and buffers to the CPU. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "cuda" Move all model parameters and buffers to the GPU. @@ -131,45 +131,45 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "double" Casts all floating point parameters and buffers to ``double`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "eval" Set the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. See :ref:`locally-disable-grad-doc` for a comparison between `.eval()` and several similar mechanisms that may be confused with it. Returns: Module: self - + ???- note "extra_repr" Set the extra representation of the module. To print customized extra information, you should re-implement this method in your own modules. Both single-line and multi-line strings are acceptable. - + ???- note "float" Casts all floating point parameters and buffers to ``float`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "forward" Compute the Contrastive loss. **Parameters** - - **sentence_features** (*Iterable[dict[str, torch.Tensor]]*) - - **labels** (*torch.Tensor | None*) – defaults to `None` - + - **sentence_features** (*Iterable[dict[str, torch.Tensor]]*) + - **labels** (*torch.Tensor | None*) – defaults to `None` + ???- note "get_buffer" Return the buffer given by ``target`` if it exists, otherwise throw an error. @@ -178,15 +178,15 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_extra_state" Return any extra state to include in the module's state_dict. Implement this and a corresponding :func:`set_extra_state` for your module if you need to store extra state. This function is called when building the module's `state_dict()`. Note that extra state should be picklable to ensure working serialization of the state_dict. We only provide backwards compatibility guarantees for serializing Tensors; other objects may break backwards compatibility if their serialized pickled form changes. Returns: object: Any extra state to store in the module's state_dict - + ???- note "get_parameter" Return the parameter given by ``target`` if it exists, otherwise throw an error. @@ -195,8 +195,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_submodule" Return the submodule given by ``target`` if it exists, otherwise throw an error. @@ -205,15 +205,15 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "half" Casts all floating point parameters and buffers to ``half`` datatype. .. note:: This method modifies the module in-place. 
Returns: Module: self - + ???- note "ipu" Move all model parameters and buffers to the IPU. @@ -222,8 +222,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "load_state_dict" Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. @@ -232,17 +232,17 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **state_dict** (*Mapping[str, Any]*) - - **strict** (*bool*) – defaults to `True` - - **assign** (*bool*) – defaults to `False` - + - **state_dict** (*Mapping[str, Any]*) + - **strict** (*bool*) – defaults to `True` + - **assign** (*bool*) – defaults to `False` + ???- note "modules" Return an iterator over all modules in the network. Yields: Module: a module in the network Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): ... print(idx, '->', m) 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True) - + ???- note "named_buffers" Return an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. @@ -251,17 +251,17 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_children" Return an iterator over immediate children modules, yielding both the name of the module as well as the module itself. Yields: (str, Module): Tuple containing a name and child module Example:: >>> # xdoctest: +SKIP("undefined vars") >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) - + ???- note "named_modules" Return an iterator over all modules in the network, yielding both the name of the module as well as the module itself. @@ -270,10 +270,10 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` - - **prefix** (*str*) – defaults to `` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` + - **prefix** (*str*) – defaults to `` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_parameters" Return an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. @@ -282,10 +282,10 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "parameters" Return an iterator over module parameters. 
@@ -294,8 +294,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "register_backward_hook" Register a backward hook on the module. @@ -304,8 +304,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + ???- note "register_buffer" Add a buffer to the module. @@ -314,10 +314,10 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **name** (*str*) - - **tensor** (*Optional[torch.Tensor]*) - - **persistent** (*bool*) – defaults to `True` - + - **name** (*str*) + - **tensor** (*Optional[torch.Tensor]*) + - **persistent** (*bool*) – defaults to `True` + ???- note "register_forward_hook" Register a forward hook on the module. @@ -326,11 +326,11 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - - **always_call** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + - **always_call** (*bool*) – defaults to `False` + ???- note "register_forward_pre_hook" Register a forward pre-hook on the module. @@ -339,10 +339,10 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + ???- note "register_full_backward_hook" Register a backward hook on the module. @@ -351,9 +351,9 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_full_backward_pre_hook" Register a backward pre-hook on the module. @@ -362,9 +362,9 @@ Contrastive loss. 
Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_load_state_dict_post_hook" Register a post hook to be run after module's ``load_state_dict`` is called. @@ -373,17 +373,17 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** - + - **hook** + ???- note "register_module" Alias for :func:`add_module`. **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "register_parameter" Add a parameter to the module. @@ -392,9 +392,9 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **name** (*str*) - - **param** (*Optional[torch.nn.parameter.Parameter]*) - + - **name** (*str*) + - **param** (*Optional[torch.nn.parameter.Parameter]*) + ???- note "register_state_dict_pre_hook" Register a pre-hook for the :meth:`~torch.nn.Module.state_dict` method. @@ -403,8 +403,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **hook** - + - **hook** + ???- note "requires_grad_" Change if autograd should record operations on parameters in this module. @@ -413,8 +413,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **requires_grad** (*bool*) – defaults to `True` - + - **requires_grad** (*bool*) – defaults to `True` + ???- note "set_extra_state" Set extra state contained in the loaded `state_dict`. @@ -423,13 +423,13 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **state** (*Any*) - + - **state** (*Any*) + ???- note "share_memory" See :meth:`torch.Tensor.share_memory_`. - + ???- note "state_dict" Return a dictionary containing references to the whole state of the module. @@ -438,11 +438,11 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **args** - - **destination** – defaults to `None` - - **prefix** – defaults to `` - - **keep_vars** – defaults to `False` - + - **args** + - **destination** – defaults to `None` + - **prefix** – defaults to `` + - **keep_vars** – defaults to `False` + ???- note "to" Move and/or cast the parameters and buffers. @@ -451,9 +451,9 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "to_empty" Move the parameters and buffers to the specified device without copying storage. @@ -462,9 +462,9 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **device** (*Union[int, str, torch.device, NoneType]*) - - **recurse** (*bool*) – defaults to `True` - + - **device** (*Union[int, str, torch.device, NoneType]*) + - **recurse** (*bool*) – defaults to `True` + ???- note "train" Set the module in training mode. @@ -473,8 +473,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. 
If th **Parameters** - - **mode** (*bool*) – defaults to `True` - + - **mode** (*bool*) – defaults to `True` + ???- note "type" Casts all parameters and buffers to :attr:`dst_type`. @@ -483,8 +483,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **dst_type** (*Union[torch.dtype, str]*) - + - **dst_type** (*Union[torch.dtype, str]*) + ???- note "xpu" Move all model parameters and buffers to the XPU. @@ -493,8 +493,8 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "zero_grad" Reset gradients of all model parameters. @@ -503,5 +503,4 @@ Contrastive loss. Expects as input two texts and a label of either 0 or 1. If th **Parameters** - - **set_to_none** (*bool*) – defaults to `True` - + - **set_to_none** (*bool*) – defaults to `True` diff --git a/docs/api/losses/Distillation.md b/docs/api/losses/Distillation.md index 824b8205..df4815e0 100644 --- a/docs/api/losses/Distillation.md +++ b/docs/api/losses/Distillation.md @@ -61,9 +61,9 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "add_module" Add a child module to the current module. @@ -72,9 +72,9 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "apply" Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. @@ -83,15 +83,15 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) - + - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) + ???- note "bfloat16" Casts all floating point parameters and buffers to ``bfloat16`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "buffers" Return an iterator over module buffers. @@ -100,15 +100,15 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "children" Return an iterator over immediate children modules. Yields: Module: a child module - + ???- note "compile" Compile this Module's forward using :func:`torch.compile`. @@ -117,16 +117,16 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "cpu" Move all model parameters and buffers to the CPU. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "cuda" Move all model parameters and buffers to the GPU. @@ -135,45 +135,45 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "double" Casts all floating point parameters and buffers to ``double`` datatype. .. note:: This method modifies the module in-place. 
Returns: Module: self - + ???- note "eval" Set the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. See :ref:`locally-disable-grad-doc` for a comparison between `.eval()` and several similar mechanisms that may be confused with it. Returns: Module: self - + ???- note "extra_repr" Set the extra representation of the module. To print customized extra information, you should re-implement this method in your own modules. Both single-line and multi-line strings are acceptable. - + ???- note "float" Casts all floating point parameters and buffers to ``float`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "forward" Computes the distillation loss with respect to SentenceTransformer. **Parameters** - - **sentence_features** (*Iterable[dict[str, torch.Tensor]]*) - - **labels** (*torch.Tensor*) - + - **sentence_features** (*Iterable[dict[str, torch.Tensor]]*) + - **labels** (*torch.Tensor*) + ???- note "get_buffer" Return the buffer given by ``target`` if it exists, otherwise throw an error. @@ -182,15 +182,15 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_extra_state" Return any extra state to include in the module's state_dict. Implement this and a corresponding :func:`set_extra_state` for your module if you need to store extra state. This function is called when building the module's `state_dict()`. Note that extra state should be picklable to ensure working serialization of the state_dict. We only provide backwards compatibility guarantees for serializing Tensors; other objects may break backwards compatibility if their serialized pickled form changes. Returns: object: Any extra state to store in the module's state_dict - + ???- note "get_parameter" Return the parameter given by ``target`` if it exists, otherwise throw an error. @@ -199,8 +199,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_submodule" Return the submodule given by ``target`` if it exists, otherwise throw an error. @@ -209,15 +209,15 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "half" Casts all floating point parameters and buffers to ``half`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "ipu" Move all model parameters and buffers to the IPU. @@ -226,8 +226,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "load_state_dict" Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. @@ -236,17 +236,17 @@ Distillation loss for ColBERT model. 
The loss is computed with respect to the fo **Parameters** - - **state_dict** (*Mapping[str, Any]*) - - **strict** (*bool*) – defaults to `True` - - **assign** (*bool*) – defaults to `False` - + - **state_dict** (*Mapping[str, Any]*) + - **strict** (*bool*) – defaults to `True` + - **assign** (*bool*) – defaults to `False` + ???- note "modules" Return an iterator over all modules in the network. Yields: Module: a module in the network Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): ... print(idx, '->', m) 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True) - + ???- note "named_buffers" Return an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. @@ -255,17 +255,17 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_children" Return an iterator over immediate children modules, yielding both the name of the module as well as the module itself. Yields: (str, Module): Tuple containing a name and child module Example:: >>> # xdoctest: +SKIP("undefined vars") >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) - + ???- note "named_modules" Return an iterator over all modules in the network, yielding both the name of the module as well as the module itself. @@ -274,10 +274,10 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` - - **prefix** (*str*) – defaults to `` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` + - **prefix** (*str*) – defaults to `` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_parameters" Return an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. @@ -286,10 +286,10 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "parameters" Return an iterator over module parameters. @@ -298,8 +298,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "register_backward_hook" Register a backward hook on the module. @@ -308,8 +308,8 @@ Distillation loss for ColBERT model. 
The loss is computed with respect to the fo **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + ???- note "register_buffer" Add a buffer to the module. @@ -318,10 +318,10 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **name** (*str*) - - **tensor** (*Optional[torch.Tensor]*) - - **persistent** (*bool*) – defaults to `True` - + - **name** (*str*) + - **tensor** (*Optional[torch.Tensor]*) + - **persistent** (*bool*) – defaults to `True` + ???- note "register_forward_hook" Register a forward hook on the module. @@ -330,11 +330,11 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - - **always_call** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + - **always_call** (*bool*) – defaults to `False` + ???- note "register_forward_pre_hook" Register a forward pre-hook on the module. @@ -343,10 +343,10 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + ???- note "register_full_backward_hook" Register a backward hook on the module. @@ -355,9 +355,9 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_full_backward_pre_hook" Register a backward pre-hook on the module. @@ -366,9 +366,9 @@ Distillation loss for ColBERT model. 
The loss is computed with respect to the fo **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_load_state_dict_post_hook" Register a post hook to be run after module's ``load_state_dict`` is called. @@ -377,17 +377,17 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **hook** - + - **hook** + ???- note "register_module" Alias for :func:`add_module`. **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "register_parameter" Add a parameter to the module. @@ -396,9 +396,9 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **name** (*str*) - - **param** (*Optional[torch.nn.parameter.Parameter]*) - + - **name** (*str*) + - **param** (*Optional[torch.nn.parameter.Parameter]*) + ???- note "register_state_dict_pre_hook" Register a pre-hook for the :meth:`~torch.nn.Module.state_dict` method. @@ -407,8 +407,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **hook** - + - **hook** + ???- note "requires_grad_" Change if autograd should record operations on parameters in this module. @@ -417,8 +417,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **requires_grad** (*bool*) – defaults to `True` - + - **requires_grad** (*bool*) – defaults to `True` + ???- note "set_extra_state" Set extra state contained in the loaded `state_dict`. @@ -427,13 +427,13 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **state** (*Any*) - + - **state** (*Any*) + ???- note "share_memory" See :meth:`torch.Tensor.share_memory_`. - + ???- note "state_dict" Return a dictionary containing references to the whole state of the module. @@ -442,11 +442,11 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **args** - - **destination** – defaults to `None` - - **prefix** – defaults to `` - - **keep_vars** – defaults to `False` - + - **args** + - **destination** – defaults to `None` + - **prefix** – defaults to `` + - **keep_vars** – defaults to `False` + ???- note "to" Move and/or cast the parameters and buffers. @@ -455,9 +455,9 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "to_empty" Move the parameters and buffers to the specified device without copying storage. @@ -466,9 +466,9 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **device** (*Union[int, str, torch.device, NoneType]*) - - **recurse** (*bool*) – defaults to `True` - + - **device** (*Union[int, str, torch.device, NoneType]*) + - **recurse** (*bool*) – defaults to `True` + ???- note "train" Set the module in training mode. @@ -477,8 +477,8 @@ Distillation loss for ColBERT model. 
The loss is computed with respect to the fo **Parameters** - - **mode** (*bool*) – defaults to `True` - + - **mode** (*bool*) – defaults to `True` + ???- note "type" Casts all parameters and buffers to :attr:`dst_type`. @@ -487,8 +487,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **dst_type** (*Union[torch.dtype, str]*) - + - **dst_type** (*Union[torch.dtype, str]*) + ???- note "xpu" Move all model parameters and buffers to the XPU. @@ -497,8 +497,8 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "zero_grad" Reset gradients of all model parameters. @@ -507,5 +507,4 @@ Distillation loss for ColBERT model. The loss is computed with respect to the fo **Parameters** - - **set_to_none** (*bool*) – defaults to `True` - + - **set_to_none** (*bool*) – defaults to `True` diff --git a/docs/api/models/ColBERT.md b/docs/api/models/ColBERT.md index f877349a..486e4db7 100644 --- a/docs/api/models/ColBERT.md +++ b/docs/api/models/ColBERT.md @@ -197,9 +197,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "active_adapter" ???- note "active_adapters" @@ -208,7 +208,7 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul Gets the current active adapters of the model. In case of multi-adapter inference (combining multiple adapters for inference) returns the list of all active adapters so that users can deal with them accordingly. For previous PEFT versions (that does not support multi-adapter inference), `module.active_adapter` will return a single string. - + ???- note "add_adapter" Adds a fresh new adapter to the current model for training purposes. If no adapter name is passed, a default name is assigned to the adapter to follow the convention of PEFT library (in PEFT we use "default" as the default adapter name). @@ -217,9 +217,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "add_module" Add a child module to the current module. @@ -228,9 +228,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "append" Append a given module to the end. @@ -239,8 +239,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **module** (*torch.nn.modules.module.Module*) - + - **module** (*torch.nn.modules.module.Module*) + ???- note "apply" Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. @@ -249,15 +249,15 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) - + - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) + ???- note "bfloat16" Casts all floating point parameters and buffers to ``bfloat16`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "buffers" Return an iterator over module buffers. 
@@ -266,8 +266,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "check_peft_compatible_model" ???- note "children" @@ -276,7 +276,7 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul Yields: Module: a child module - + ???- note "compile" Compile this Module's forward using :func:`torch.compile`. @@ -285,16 +285,16 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "cpu" Move all model parameters and buffers to the CPU. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "cuda" Move all model parameters and buffers to the GPU. @@ -303,74 +303,74 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` Device (like "cuda", "cpu", "mps", "npu") that should be used for computation. If None, checks if a GPU can be used. - + ???- note "disable_adapters" Disable all adapters that are attached to the model. This leads to inferring with the base model only. - + ???- note "double" Casts all floating point parameters and buffers to ``double`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "enable_adapters" Enable adapters that are attached to the model. The model will use `self.active_adapter()` - + ???- note "encode" Computes sentence embeddings. **Parameters** - - **sentences** (*str | list[str]*) - - **prompt_name** (*str | None*) – defaults to `None` - - **prompt** (*str | None*) – defaults to `None` - - **batch_size** (*int*) – defaults to `32` - - **show_progress_bar** (*bool*) – defaults to `None` - - **precision** (*Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']*) – defaults to `float32` - - **convert_to_numpy** (*bool*) – defaults to `True` - - **convert_to_tensor** (*bool*) – defaults to `False` - - **padding** (*bool*) – defaults to `False` - - **device** (*str*) – defaults to `None` + - **sentences** (*str | list[str]*) + - **prompt_name** (*str | None*) – defaults to `None` + - **prompt** (*str | None*) – defaults to `None` + - **batch_size** (*int*) – defaults to `32` + - **show_progress_bar** (*bool*) – defaults to `None` + - **precision** (*Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']*) – defaults to `float32` + - **convert_to_numpy** (*bool*) – defaults to `True` + - **convert_to_tensor** (*bool*) – defaults to `False` + - **padding** (*bool*) – defaults to `False` + - **device** (*str*) – defaults to `None` Device (like "cuda", "cpu", "mps", "npu") that should be used for computation. If None, checks if a GPU can be used. - - **normalize_embeddings** (*bool*) – defaults to `True` - - **is_query** (*bool*) – defaults to `True` - - **pool_factor** (*int*) – defaults to `1` - - **protected_tokens** (*int*) – defaults to `1` - + - **normalize_embeddings** (*bool*) – defaults to `True` + - **is_query** (*bool*) – defaults to `True` + - **pool_factor** (*int*) – defaults to `1` + - **protected_tokens** (*int*) – defaults to `1` + ???- note "encode_multi_process" Encodes a list of sentences using multiple processes and GPUs via :meth:`SentenceTransformer.encode `. 
The sentences are chunked into smaller packages and sent to individual processes, which encode them on different GPUs or CPUs. This method is only suitable for encoding large sets of sentences. **Parameters** - - **sentences** (*list[str]*) - - **pool** (*dict[str, object]*) - - **prompt_name** (*str | None*) – defaults to `None` - - **prompt** (*str | None*) – defaults to `None` - - **batch_size** (*int*) – defaults to `32` - - **chunk_size** (*int*) – defaults to `None` - - **precision** (*Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']*) – defaults to `float32` - - **normalize_embeddings** (*bool*) – defaults to `True` - - **padding** (*bool*) – defaults to `False` - - **is_query** (*bool*) – defaults to `True` - - **pool_factor** (*int*) – defaults to `1` - - **protected_tokens** (*int*) – defaults to `1` - + - **sentences** (*list[str]*) + - **pool** (*dict[str, object]*) + - **prompt_name** (*str | None*) – defaults to `None` + - **prompt** (*str | None*) – defaults to `None` + - **batch_size** (*int*) – defaults to `32` + - **chunk_size** (*int*) – defaults to `None` + - **precision** (*Literal['float32', 'int8', 'uint8', 'binary', 'ubinary']*) – defaults to `float32` + - **normalize_embeddings** (*bool*) – defaults to `True` + - **padding** (*bool*) – defaults to `False` + - **is_query** (*bool*) – defaults to `True` + - **pool_factor** (*int*) – defaults to `1` + - **protected_tokens** (*int*) – defaults to `1` + ???- note "eval" Set the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. See :ref:`locally-disable-grad-doc` for a comparison between `.eval()` and several similar mechanisms that may be confused with it. Returns: Module: self - + ???- note "evaluate" Evaluate the model based on an evaluator @@ -379,9 +379,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **evaluator** (*'SentenceEvaluator'*) - - **output_path** (*'str'*) – defaults to `None` - + - **evaluator** (*'SentenceEvaluator'*) + - **output_path** (*'str'*) – defaults to `None` + ???- note "extend" ???- note "extra_repr" @@ -390,7 +390,7 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul To print customized extra information, you should re-implement this method in your own modules. Both single-line and multi-line strings are acceptable. - + ???- note "fit" Deprecated training method from before Sentence Transformers v3.0, it is recommended to use :class:`~sentence_transformers.trainer.SentenceTransformerTrainer` instead. This method uses :class:`~sentence_transformers.trainer.SentenceTransformerTrainer` behind the scenes, but does not provide as much flexibility as the Trainer itself. 
@@ -399,33 +399,33 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **train_objectives** (*'Iterable[tuple[DataLoader, nn.Module]]'*) - - **evaluator** (*'SentenceEvaluator'*) – defaults to `None` - - **epochs** (*'int'*) – defaults to `1` - - **steps_per_epoch** – defaults to `None` - - **scheduler** (*'str'*) – defaults to `WarmupLinear` - - **warmup_steps** (*'int'*) – defaults to `10000` - - **optimizer_class** (*'type[Optimizer]'*) – defaults to `` - - **optimizer_params** (*'dict[str, object]'*) – defaults to `{'lr': 2e-05}` - - **weight_decay** (*'float'*) – defaults to `0.01` - - **evaluation_steps** (*'int'*) – defaults to `0` - - **output_path** (*'str'*) – defaults to `None` - - **save_best_model** (*'bool'*) – defaults to `True` - - **max_grad_norm** (*'float'*) – defaults to `1` - - **use_amp** (*'bool'*) – defaults to `False` - - **callback** (*'Callable[[float, int, int], None]'*) – defaults to `None` - - **show_progress_bar** (*'bool'*) – defaults to `True` - - **checkpoint_path** (*'str'*) – defaults to `None` - - **checkpoint_save_steps** (*'int'*) – defaults to `500` - - **checkpoint_save_total_limit** (*'int'*) – defaults to `0` - + - **train_objectives** (*'Iterable[tuple[DataLoader, nn.Module]]'*) + - **evaluator** (*'SentenceEvaluator'*) – defaults to `None` + - **epochs** (*'int'*) – defaults to `1` + - **steps_per_epoch** – defaults to `None` + - **scheduler** (*'str'*) – defaults to `WarmupLinear` + - **warmup_steps** (*'int'*) – defaults to `10000` + - **optimizer_class** (*'type[Optimizer]'*) – defaults to `` + - **optimizer_params** (*'dict[str, object]'*) – defaults to `{'lr': 2e-05}` + - **weight_decay** (*'float'*) – defaults to `0.01` + - **evaluation_steps** (*'int'*) – defaults to `0` + - **output_path** (*'str'*) – defaults to `None` + - **save_best_model** (*'bool'*) – defaults to `True` + - **max_grad_norm** (*'float'*) – defaults to `1` + - **use_amp** (*'bool'*) – defaults to `False` + - **callback** (*'Callable[[float, int, int], None]'*) – defaults to `None` + - **show_progress_bar** (*'bool'*) – defaults to `True` + - **checkpoint_path** (*'str'*) – defaults to `None` + - **checkpoint_save_steps** (*'int'*) – defaults to `500` + - **checkpoint_save_total_limit** (*'int'*) – defaults to `0` + ???- note "float" Casts all floating point parameters and buffers to ``float`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "forward" Define the computation performed at every call. @@ -434,9 +434,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **input** (*'dict[str, Tensor]'*) - - **kwargs** - + - **input** (*'dict[str, Tensor]'*) + - **kwargs** + ???- note "get_adapter_state_dict" If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT official documentation: https://huggingface.co/docs/peft @@ -445,16 +445,16 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "get_backend" Return the backend used for inference, which can be one of "torch", "onnx", or "openvino". Returns: str: The backend used for inference. - + ???- note "get_buffer" Return the buffer given by ``target`` if it exists, otherwise throw an error. 
@@ -463,22 +463,22 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_extra_state" Return any extra state to include in the module's state_dict. Implement this and a corresponding :func:`set_extra_state` for your module if you need to store extra state. This function is called when building the module's `state_dict()`. Note that extra state should be picklable to ensure working serialization of the state_dict. We only provide backwards compatibility guarantees for serializing Tensors; other objects may break backwards compatibility if their serialized pickled form changes. Returns: object: Any extra state to store in the module's state_dict - + ???- note "get_max_seq_length" Returns the maximal sequence length that the model accepts. Longer inputs will be truncated. Returns: Optional[int]: The maximal sequence length that the model accepts, or None if it is not defined. - + ???- note "get_parameter" Return the parameter given by ``target`` if it exists, otherwise throw an error. @@ -487,15 +487,15 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_sentence_embedding_dimension" Returns the number of dimensions in the output of :meth:`SentenceTransformer.encode `. Returns: Optional[int]: The number of dimensions in the output of `encode`. If it's not known, it's `None`. - + ???- note "get_sentence_features" ???- note "get_submodule" Return the submodule given by ``target`` if it exists, otherwise throw an error. @@ -506,8 +506,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "gradient_checkpointing_enable" ???- note "half" Casts all floating point parameters and buffers to ``half`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "has_peft_compatible_model" ???- note "insert" @@ -527,9 +527,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **input_ids** (*torch.Tensor*) - - **prefix_id** (*int*) - + - **input_ids** (*torch.Tensor*) + - **prefix_id** (*int*) + ???- note "ipu" Move all model parameters and buffers to the IPU. @@ -538,9 +538,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` Device (like "cuda", "cpu", "mps", "npu") that should be used for computation. If None, checks if a GPU can be used. - + ???- note "load" ???- note "load_adapter" @@ -551,9 +551,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "load_state_dict" Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. @@ -562,17 +562,17 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **state_dict** (*Mapping[str, Any]*) - - **strict** (*bool*) – defaults to `True` - - **assign** (*bool*) – defaults to `False` - + - **state_dict** (*Mapping[str, Any]*) + - **strict** (*bool*) – defaults to `True` + - **assign** (*bool*) – defaults to `False` + ???- note "modules" Return an iterator over all modules in the network. 
Yields: Module: a module in the network Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): ... print(idx, '->', m) 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True) - + ???- note "named_buffers" Return an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. @@ -581,17 +581,17 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_children" Return an iterator over immediate children modules, yielding both the name of the module as well as the module itself. Yields: (str, Module): Tuple containing a name and child module Example:: >>> # xdoctest: +SKIP("undefined vars") >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) - + ???- note "named_modules" Return an iterator over all modules in the network, yielding both the name of the module as well as the module itself. @@ -600,10 +600,10 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` - - **prefix** (*str*) – defaults to `` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` + - **prefix** (*str*) – defaults to `` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_parameters" Return an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. @@ -612,10 +612,10 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "old_fit" Deprecated training method from before Sentence Transformers v3.0, it is recommended to use :class:`sentence_transformers.trainer.SentenceTransformerTrainer` instead. This method should only be used if you encounter issues with your existing training scripts after upgrading to v3.0+. 
@@ -624,26 +624,26 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **train_objectives** (*'Iterable[tuple[DataLoader, nn.Module]]'*) - - **evaluator** (*'SentenceEvaluator'*) – defaults to `None` - - **epochs** (*'int'*) – defaults to `1` - - **steps_per_epoch** – defaults to `None` - - **scheduler** (*'str'*) – defaults to `WarmupLinear` - - **warmup_steps** (*'int'*) – defaults to `10000` - - **optimizer_class** (*'type[Optimizer]'*) – defaults to `` - - **optimizer_params** (*'dict[str, object]'*) – defaults to `{'lr': 2e-05}` - - **weight_decay** (*'float'*) – defaults to `0.01` - - **evaluation_steps** (*'int'*) – defaults to `0` - - **output_path** (*'str'*) – defaults to `None` - - **save_best_model** (*'bool'*) – defaults to `True` - - **max_grad_norm** (*'float'*) – defaults to `1` - - **use_amp** (*'bool'*) – defaults to `False` - - **callback** (*'Callable[[float, int, int], None]'*) – defaults to `None` - - **show_progress_bar** (*'bool'*) – defaults to `True` - - **checkpoint_path** (*'str'*) – defaults to `None` - - **checkpoint_save_steps** (*'int'*) – defaults to `500` - - **checkpoint_save_total_limit** (*'int'*) – defaults to `0` - + - **train_objectives** (*'Iterable[tuple[DataLoader, nn.Module]]'*) + - **evaluator** (*'SentenceEvaluator'*) – defaults to `None` + - **epochs** (*'int'*) – defaults to `1` + - **steps_per_epoch** – defaults to `None` + - **scheduler** (*'str'*) – defaults to `WarmupLinear` + - **warmup_steps** (*'int'*) – defaults to `10000` + - **optimizer_class** (*'type[Optimizer]'*) – defaults to `` + - **optimizer_params** (*'dict[str, object]'*) – defaults to `{'lr': 2e-05}` + - **weight_decay** (*'float'*) – defaults to `0.01` + - **evaluation_steps** (*'int'*) – defaults to `0` + - **output_path** (*'str'*) – defaults to `None` + - **save_best_model** (*'bool'*) – defaults to `True` + - **max_grad_norm** (*'float'*) – defaults to `1` + - **use_amp** (*'bool'*) – defaults to `False` + - **callback** (*'Callable[[float, int, int], None]'*) – defaults to `None` + - **show_progress_bar** (*'bool'*) – defaults to `True` + - **checkpoint_path** (*'str'*) – defaults to `None` + - **checkpoint_save_steps** (*'int'*) – defaults to `500` + - **checkpoint_save_total_limit** (*'int'*) – defaults to `0` + ???- note "parameters" Return an iterator over module parameters. @@ -652,22 +652,22 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "pool_embeddings_hierarchical" Pools the embeddings hierarchically by clustering and averaging them. **Parameters** - - **documents_embeddings** (*list[torch.Tensor]*) - - **pool_factor** (*int*) – defaults to `1` - - **protected_tokens** (*int*) – defaults to `1` - + - **documents_embeddings** (*list[torch.Tensor]*) + - **pool_factor** (*int*) – defaults to `1` + - **protected_tokens** (*int*) – defaults to `1` + **Returns** *list[torch.Tensor]*: A list of pooled embeddings for each document. - + ???- note "pop" ???- note "push_to_hub" @@ -678,20 +678,20 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **repo_id** (*'str'*) - - **token** (*'str | None'*) – defaults to `None` + - **repo_id** (*'str'*) + - **token** (*'str | None'*) – defaults to `None` Hugging Face authentication token to download private models. 
- - **private** (*'bool | None'*) – defaults to `None` - - **safe_serialization** (*'bool'*) – defaults to `True` - - **commit_message** (*'str | None'*) – defaults to `None` - - **local_model_path** (*'str | None'*) – defaults to `None` - - **exist_ok** (*'bool'*) – defaults to `False` - - **replace_model_card** (*'bool'*) – defaults to `False` - - **train_datasets** (*'list[str] | None'*) – defaults to `None` - - **revision** (*'str | None'*) – defaults to `None` + - **private** (*'bool | None'*) – defaults to `None` + - **safe_serialization** (*'bool'*) – defaults to `True` + - **commit_message** (*'str | None'*) – defaults to `None` + - **local_model_path** (*'str | None'*) – defaults to `None` + - **exist_ok** (*'bool'*) – defaults to `False` + - **replace_model_card** (*'bool'*) – defaults to `False` + - **train_datasets** (*'list[str] | None'*) – defaults to `None` + - **revision** (*'str | None'*) – defaults to `None` The specific model version to use. It can be a branch name, a tag name, or a commit id, for a stored model on Hugging Face. - - **create_pr** (*'bool'*) – defaults to `False` - + - **create_pr** (*'bool'*) – defaults to `False` + ???- note "register_backward_hook" Register a backward hook on the module. @@ -700,8 +700,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + ???- note "register_buffer" Add a buffer to the module. @@ -710,10 +710,10 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **name** (*str*) - - **tensor** (*Optional[torch.Tensor]*) - - **persistent** (*bool*) – defaults to `True` - + - **name** (*str*) + - **tensor** (*Optional[torch.Tensor]*) + - **persistent** (*bool*) – defaults to `True` + ???- note "register_forward_hook" Register a forward hook on the module. @@ -722,11 +722,11 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - - **always_call** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + - **always_call** (*bool*) – defaults to `False` + ???- note "register_forward_pre_hook" Register a forward pre-hook on the module. 
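The hook registration methods are inherited unchanged from `torch.nn.Module`, so the standard PyTorch patterns apply. A sketch of a forward pre-hook used to inspect what the first submodule receives (the checkpoint name is assumed; the parameters follow below):

```python
from pylate import models

model = models.ColBERT(model_name_or_path="lightonai/colbertv2.0")  # assumed checkpoint

def inspect_inputs(module, args):
    # Pre-hooks fire before module.forward; returning None leaves the inputs unchanged.
    features = args[0]
    print(module.__class__.__name__, "received:", sorted(features))

# model[0] is the Transformer module; keep the handle to detach the hook later.
handle = model[0].register_forward_pre_hook(inspect_inputs)
model.encode(["a probe sentence"], is_query=True)
handle.remove()
```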
@@ -735,10 +735,10 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + ???- note "register_full_backward_hook" Register a backward hook on the module. @@ -747,9 +747,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_full_backward_pre_hook" Register a backward pre-hook on the module. @@ -758,9 +758,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_load_state_dict_post_hook" Register a post hook to be run after module's ``load_state_dict`` is called. @@ -769,17 +769,17 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** - + - **hook** + ???- note "register_module" Alias for :func:`add_module`. **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "register_parameter" Add a parameter to the module. @@ -788,9 +788,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **name** (*str*) - - **param** (*Optional[torch.nn.parameter.Parameter]*) - + - **name** (*str*) + - **param** (*Optional[torch.nn.parameter.Parameter]*) + ???- note "register_state_dict_pre_hook" Register a pre-hook for the :meth:`~torch.nn.Module.state_dict` method. @@ -799,8 +799,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **hook** - + - **hook** + ???- note "requires_grad_" Change if autograd should record operations on parameters in this module. @@ -809,8 +809,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **requires_grad** (*bool*) – defaults to `True` - + - **requires_grad** (*bool*) – defaults to `True` + ???- note "save" Saves a model and its configuration files to a directory, so that it can be loaded with ``SentenceTransformer(path)`` again. 
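For `save`, the round trip is the part worth showing; the paths and model name below are placeholders, and the parameters are listed after the sketch:

```python
from pylate import models

model = models.ColBERT(model_name_or_path="lightonai/colbertv2.0")  # assumed checkpoint

# Writes the weights, configuration, and an auto-generated model card.
model.save(path="colbert-checkpoint", model_name="my-colbert", safe_serialization=True)

# The folder can then be reloaded like any other checkpoint.
reloaded = models.ColBERT(model_name_or_path="colbert-checkpoint")
```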
@@ -819,12 +819,12 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **path** (*str*) - - **model_name** (*str | None*) – defaults to `None` - - **create_model_card** (*bool*) – defaults to `True` - - **train_datasets** (*list[str] | None*) – defaults to `None` - - **safe_serialization** (*bool*) – defaults to `True` - + - **path** (*str*) + - **model_name** (*str | None*) – defaults to `None` + - **create_model_card** (*bool*) – defaults to `True` + - **train_datasets** (*list[str] | None*) – defaults to `None` + - **safe_serialization** (*bool*) – defaults to `True` + ???- note "save_pretrained" Saves a model and its configuration files to a directory, so that it can be loaded with ``SentenceTransformer(path)`` again. @@ -833,12 +833,12 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **path** (*'str'*) - - **model_name** (*'str | None'*) – defaults to `None` - - **create_model_card** (*'bool'*) – defaults to `True` - - **train_datasets** (*'list[str] | None'*) – defaults to `None` - - **safe_serialization** (*'bool'*) – defaults to `True` - + - **path** (*'str'*) + - **model_name** (*'str | None'*) – defaults to `None` + - **create_model_card** (*'bool'*) – defaults to `True` + - **train_datasets** (*'list[str] | None'*) – defaults to `None` + - **safe_serialization** (*'bool'*) – defaults to `True` + ???- note "save_to_hub" DEPRECATED, use `push_to_hub` instead. @@ -847,18 +847,18 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **repo_id** (*'str'*) - - **organization** (*'str | None'*) – defaults to `None` - - **token** (*'str | None'*) – defaults to `None` + - **repo_id** (*'str'*) + - **organization** (*'str | None'*) – defaults to `None` + - **token** (*'str | None'*) – defaults to `None` Hugging Face authentication token to download private models. - - **private** (*'bool | None'*) – defaults to `None` - - **safe_serialization** (*'bool'*) – defaults to `True` - - **commit_message** (*'str'*) – defaults to `Add new SentenceTransformer model.` - - **local_model_path** (*'str | None'*) – defaults to `None` - - **exist_ok** (*'bool'*) – defaults to `False` - - **replace_model_card** (*'bool'*) – defaults to `False` - - **train_datasets** (*'list[str] | None'*) – defaults to `None` - + - **private** (*'bool | None'*) – defaults to `None` + - **safe_serialization** (*'bool'*) – defaults to `True` + - **commit_message** (*'str'*) – defaults to `Add new SentenceTransformer model.` + - **local_model_path** (*'str | None'*) – defaults to `None` + - **exist_ok** (*'bool'*) – defaults to `False` + - **replace_model_card** (*'bool'*) – defaults to `False` + - **train_datasets** (*'list[str] | None'*) – defaults to `None` + ???- note "set_adapter" Sets a specific adapter by forcing the model to use a that adapter and disable the other adapters. @@ -867,9 +867,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "set_extra_state" Set extra state contained in the loaded `state_dict`. @@ -878,8 +878,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **state** (*Any*) - + - **state** (*Any*) + ???- note "set_pooling_include_prompt" Sets the `include_prompt` attribute in the pooling layer in the model, if there is one. 
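A one-line illustration of the call; note that typical ColBERT checkpoints carry no Pooling module, in which case this is a no-op:

```python
from pylate import models

model = models.ColBERT(model_name_or_path="lightonai/colbertv2.0")  # assumed checkpoint

# Only has an effect if the underlying SentenceTransformer stack contains a Pooling layer.
model.set_pooling_include_prompt(include_prompt=False)
```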
@@ -888,22 +888,22 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **include_prompt** (*'bool'*) - + - **include_prompt** (*'bool'*) + ???- note "share_memory" See :meth:`torch.Tensor.share_memory_`. - + ???- note "skiplist_mask" Create a mask for the set of input_ids that are in the skiplist. **Parameters** - - **input_ids** (*torch.Tensor*) - - **skiplist** (*list[int]*) - + - **input_ids** (*torch.Tensor*) + - **skiplist** (*list[int]*) + ???- note "smart_batching_collate" Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model Here, batch is a list of InputExample instances: [InputExample(...), ...] @@ -912,20 +912,20 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **batch** (*'list[InputExample]'*) - + - **batch** (*'list[InputExample]'*) + ???- note "start_multi_process_pool" Starts a multi-process pool to process the encoding with several independent processes. This method is recommended if you want to encode on multiple GPUs or CPUs. It is advised to start only one process per GPU. This method works together with encode_multi_process and stop_multi_process_pool. **Parameters** - - **target_devices** (*list[str]*) – defaults to `None` - + - **target_devices** (*list[str]*) – defaults to `None` + **Returns** *dict*: A dictionary with the target processes, an input queue, and an output queue. - + ???- note "state_dict" Return a dictionary containing references to the whole state of the module. @@ -934,19 +934,19 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **destination** – defaults to `None` - - **prefix** – defaults to `` - - **keep_vars** – defaults to `False` - + - **args** + - **destination** – defaults to `None` + - **prefix** – defaults to `` + - **keep_vars** – defaults to `False` + ???- note "stop_multi_process_pool" Stops all processes started with start_multi_process_pool. Args: pool (Dict[str, object]): A dictionary containing the input queue, output queue, and process list. Returns: None - - **pool** (*"dict[Literal['input', 'output', 'processes'], Any]"*) - + - **pool** (*"dict[Literal['input', 'output', 'processes'], Any]"*) + ???- note "to" Move and/or cast the parameters and buffers. @@ -955,9 +955,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "to_empty" Move the parameters and buffers to the specified device without copying storage. @@ -966,10 +966,10 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **device** (*Union[int, str, torch.device, NoneType]*) + - **device** (*Union[int, str, torch.device, NoneType]*) Device (like "cuda", "cpu", "mps", "npu") that should be used for computation. If None, checks if a GPU can be used. - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "tokenize" Tokenizes the input texts. 
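A sketch of the asymmetry between the two tokenization modes (`tokenize`'s parameters follow below); the checkpoint name is a placeholder and the exact query length depends on the loaded configuration:

```python
from pylate import models

model = models.ColBERT(model_name_or_path="lightonai/colbertv2.0")  # assumed checkpoint

# Queries are expanded to a fixed query length (ColBERT-style [MASK] augmentation);
# documents keep their natural length unless pad_document=True.
query_features = model.tokenize(["what is colbert?"], is_query=True)
document_features = model.tokenize(["ColBERT is a late-interaction model."], is_query=False)

print(query_features["input_ids"].shape)     # (1, query_length)
print(document_features["input_ids"].shape)  # (1, number of document tokens)
```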
@@ -978,10 +978,10 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **texts** (*list[str] | list[dict] | list[tuple[str, str]]*) - - **is_query** (*bool*) – defaults to `True` - - **pad_document** (*bool*) – defaults to `False` - + - **texts** (*list[str] | list[dict] | list[tuple[str, str]]*) + - **is_query** (*bool*) – defaults to `True` + - **pad_document** (*bool*) – defaults to `False` + ???- note "train" Set the module in training mode. @@ -990,8 +990,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **mode** (*bool*) – defaults to `True` - + - **mode** (*bool*) – defaults to `True` + ???- note "truncate_sentence_embeddings" In this context, :meth:`SentenceTransformer.encode ` outputs sentence embeddings truncated at dimension ``truncate_dim``. @@ -1000,9 +1000,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **truncate_dim** (*'int | None'*) + - **truncate_dim** (*'int | None'*) The dimension to truncate sentence embeddings to. `None` does no truncation. Truncation is only applicable during inference when :meth:`SentenceTransformer.encode` is called. - + ???- note "type" Casts all parameters and buffers to :attr:`dst_type`. @@ -1011,8 +1011,8 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **dst_type** (*Union[torch.dtype, str]*) - + - **dst_type** (*Union[torch.dtype, str]*) + ???- note "xpu" Move all model parameters and buffers to the XPU. @@ -1021,9 +1021,9 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` Device (like "cuda", "cpu", "mps", "npu") that should be used for computation. If None, checks if a GPU can be used. - + ???- note "zero_grad" Reset gradients of all model parameters. @@ -1032,5 +1032,4 @@ Loads or creates a ColBERT model that can be used to map sentences / text to mul **Parameters** - - **set_to_none** (*bool*) – defaults to `True` - + - **set_to_none** (*bool*) – defaults to `True` diff --git a/docs/api/models/Dense.md b/docs/api/models/Dense.md index 90598a9e..43fbd350 100644 --- a/docs/api/models/Dense.md +++ b/docs/api/models/Dense.md @@ -58,9 +58,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "add_module" Add a child module to the current module. @@ -69,9 +69,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "apply" Apply ``fn`` recursively to every submodule (as returned by ``.children()``) as well as self. @@ -80,15 +80,15 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) - + - **fn** (*Callable[[ForwardRef('Module')], NoneType]*) + ???- note "bfloat16" Casts all floating point parameters and buffers to ``bfloat16`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "buffers" Return an iterator over module buffers. 
@@ -97,15 +97,15 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "children" Return an iterator over immediate children modules. Yields: Module: a child module - + ???- note "compile" Compile this Module's forward using :func:`torch.compile`. @@ -114,16 +114,16 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "cpu" Move all model parameters and buffers to the CPU. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "cuda" Move all model parameters and buffers to the GPU. @@ -132,63 +132,63 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "double" Casts all floating point parameters and buffers to ``double`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "eval" Set the module in evaluation mode. This has any effect only on certain modules. See documentations of particular modules for details of their behaviors in training/evaluation mode, if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. This is equivalent with :meth:`self.train(False) `. See :ref:`locally-disable-grad-doc` for a comparison between `.eval()` and several similar mechanisms that may be confused with it. Returns: Module: self - + ???- note "extra_repr" Set the extra representation of the module. To print customized extra information, you should re-implement this method in your own modules. Both single-line and multi-line strings are acceptable. - + ???- note "float" Casts all floating point parameters and buffers to ``float`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "forward" Performs linear projection on the token embeddings. **Parameters** - - **features** (*dict[str, torch.Tensor]*) - + - **features** (*dict[str, torch.Tensor]*) + ???- note "from_sentence_transformers" Converts a SentenceTransformer Dense model to a Dense model. Our Dense model does not have the activation function. - - **dense** (*sentence_transformers.models.Dense.Dense*) - + - **dense** (*sentence_transformers.models.Dense.Dense*) + ???- note "from_stanford_weights" Load the weight of the Dense layer using weights from a stanford-nlp checkpoint. **Parameters** - - **model_name_or_path** (*str | os.PathLike*) - - **cache_folder** (*str | os.PathLike | None*) – defaults to `None` - - **revision** (*str | None*) – defaults to `None` - - **local_files_only** (*bool | None*) – defaults to `None` - - **token** (*str | bool | None*) – defaults to `None` - - **use_auth_token** (*str | bool | None*) – defaults to `None` - + - **model_name_or_path** (*str | os.PathLike*) + - **cache_folder** (*str | os.PathLike | None*) – defaults to `None` + - **revision** (*str | None*) – defaults to `None` + - **local_files_only** (*bool | None*) – defaults to `None` + - **token** (*str | bool | None*) – defaults to `None` + - **use_auth_token** (*str | bool | None*) – defaults to `None` + ???- note "get_buffer" Return the buffer given by ``target`` if it exists, otherwise throw an error. 
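Stepping back to the `from_sentence_transformers` converter above (`get_buffer`'s parameters continue below), a sketch of the conversion; the layer sizes are arbitrary:

```python
from sentence_transformers import models as st_models

from pylate.models import Dense

# A SentenceTransformer Dense layer carries an activation function...
st_dense = st_models.Dense(in_features=768, out_features=128)

# ...while the PyLate Dense keeps only the linear projection, as noted above.
pylate_dense = Dense.from_sentence_transformers(dense=st_dense)
print(pylate_dense)
```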
@@ -197,8 +197,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_config_dict" ???- note "get_extra_state" @@ -207,7 +207,7 @@ Performs linear projection on the token embeddings to a lower dimension. Implement this and a corresponding :func:`set_extra_state` for your module if you need to store extra state. This function is called when building the module's `state_dict()`. Note that extra state should be picklable to ensure working serialization of the state_dict. We only provide provide backwards compatibility guarantees for serializing Tensors; other objects may break backwards compatibility if their serialized pickled form changes. Returns: object: Any extra state to store in the module's state_dict - + ???- note "get_parameter" Return the parameter given by ``target`` if it exists, otherwise throw an error. @@ -216,8 +216,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "get_sentence_embedding_dimension" ???- note "get_submodule" @@ -228,15 +228,15 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **target** (*str*) - + - **target** (*str*) + ???- note "half" Casts all floating point parameters and buffers to ``half`` datatype. .. note:: This method modifies the module in-place. Returns: Module: self - + ???- note "ipu" Move all model parameters and buffers to the IPU. @@ -245,14 +245,14 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "load" Load a Dense layer. - - **input_path** - + - **input_path** + ???- note "load_state_dict" Copy parameters and buffers from :attr:`state_dict` into this module and its descendants. @@ -261,17 +261,17 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **state_dict** (*Mapping[str, Any]*) - - **strict** (*bool*) – defaults to `True` - - **assign** (*bool*) – defaults to `False` - + - **state_dict** (*Mapping[str, Any]*) + - **strict** (*bool*) – defaults to `True` + - **assign** (*bool*) – defaults to `False` + ???- note "modules" Return an iterator over all modules in the network. Yields: Module: a module in the network Note: Duplicate modules are returned only once. In the following example, ``l`` will be returned only once. Example:: >>> l = nn.Linear(2, 2) >>> net = nn.Sequential(l, l) >>> for idx, m in enumerate(net.modules()): ... print(idx, '->', m) 0 -> Sequential( (0): Linear(in_features=2, out_features=2, bias=True) (1): Linear(in_features=2, out_features=2, bias=True) ) 1 -> Linear(in_features=2, out_features=2, bias=True) - + ???- note "named_buffers" Return an iterator over module buffers, yielding both the name of the buffer as well as the buffer itself. @@ -280,17 +280,17 @@ Performs linear projection on the token embeddings to a lower dimension. 
**Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_children" Return an iterator over immediate children modules, yielding both the name of the module as well as the module itself. Yields: (str, Module): Tuple containing a name and child module Example:: >>> # xdoctest: +SKIP("undefined vars") >>> for name, module in model.named_children(): >>> if name in ['conv4', 'conv5']: >>> print(module) - + ???- note "named_modules" Return an iterator over all modules in the network, yielding both the name of the module as well as the module itself. @@ -299,10 +299,10 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` - - **prefix** (*str*) – defaults to `` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **memo** (*Optional[Set[ForwardRef('Module')]]*) – defaults to `None` + - **prefix** (*str*) – defaults to `` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "named_parameters" Return an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself. @@ -311,10 +311,10 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **prefix** (*str*) – defaults to `` - - **recurse** (*bool*) – defaults to `True` - - **remove_duplicate** (*bool*) – defaults to `True` - + - **prefix** (*str*) – defaults to `` + - **recurse** (*bool*) – defaults to `True` + - **remove_duplicate** (*bool*) – defaults to `True` + ???- note "parameters" Return an iterator over module parameters. @@ -323,8 +323,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **recurse** (*bool*) – defaults to `True` - + - **recurse** (*bool*) – defaults to `True` + ???- note "register_backward_hook" Register a backward hook on the module. @@ -333,8 +333,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + ???- note "register_buffer" Add a buffer to the module. @@ -343,10 +343,10 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **name** (*str*) - - **tensor** (*Optional[torch.Tensor]*) - - **persistent** (*bool*) – defaults to `True` - + - **name** (*str*) + - **tensor** (*Optional[torch.Tensor]*) + - **persistent** (*bool*) – defaults to `True` + ???- note "register_forward_hook" Register a forward hook on the module. @@ -355,11 +355,11 @@ Performs linear projection on the token embeddings to a lower dimension. 
**Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - - **always_call** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...], Any], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any], Any], Optional[Any]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + - **always_call** (*bool*) – defaults to `False` + ???- note "register_forward_pre_hook" Register a forward pre-hook on the module. @@ -368,10 +368,10 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) - - **prepend** (*bool*) – defaults to `False` - - **with_kwargs** (*bool*) – defaults to `False` - + - **hook** (*Union[Callable[[~T, Tuple[Any, ...]], Optional[Any]], Callable[[~T, Tuple[Any, ...], Dict[str, Any]], Optional[Tuple[Any, Dict[str, Any]]]]]*) + - **prepend** (*bool*) – defaults to `False` + - **with_kwargs** (*bool*) – defaults to `False` + ???- note "register_full_backward_hook" Register a backward hook on the module. @@ -380,9 +380,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor], Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_full_backward_pre_hook" Register a backward pre-hook on the module. @@ -391,9 +391,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) - - **prepend** (*bool*) – defaults to `False` - + - **hook** (*Callable[[ForwardRef('Module'), Union[Tuple[torch.Tensor, ...], torch.Tensor]], Union[NoneType, Tuple[torch.Tensor, ...], torch.Tensor]]*) + - **prepend** (*bool*) – defaults to `False` + ???- note "register_load_state_dict_post_hook" Register a post hook to be run after module's ``load_state_dict`` is called. @@ -402,17 +402,17 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **hook** - + - **hook** + ???- note "register_module" Alias for :func:`add_module`. **Parameters** - - **name** (*str*) - - **module** (*Optional[ForwardRef('Module')]*) - + - **name** (*str*) + - **module** (*Optional[ForwardRef('Module')]*) + ???- note "register_parameter" Add a parameter to the module. @@ -421,9 +421,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **name** (*str*) - - **param** (*Optional[torch.nn.parameter.Parameter]*) - + - **name** (*str*) + - **param** (*Optional[torch.nn.parameter.Parameter]*) + ???- note "register_state_dict_pre_hook" Register a pre-hook for the :meth:`~torch.nn.Module.state_dict` method. 
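As with the other hook methods, this comes straight from `torch.nn.Module`. A sketch that logs every `state_dict()` call (checkpoint name assumed; the parameter follows below):

```python
from pylate import models

model = models.ColBERT(model_name_or_path="lightonai/colbertv2.0")  # assumed checkpoint

def audit_state_dict(module, prefix, keep_vars):
    # Runs before state_dict() gathers tensors; useful for logging or validation.
    print(f"state_dict requested (prefix={prefix!r}, keep_vars={keep_vars})")

model.register_state_dict_pre_hook(audit_state_dict)
state = model.state_dict()
```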
@@ -432,8 +432,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **hook** - + - **hook** + ???- note "requires_grad_" Change if autograd should record operations on parameters in this module. @@ -442,8 +442,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **requires_grad** (*bool*) – defaults to `True` - + - **requires_grad** (*bool*) – defaults to `True` + ???- note "save" ???- note "set_extra_state" @@ -454,13 +454,13 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **state** (*Any*) - + - **state** (*Any*) + ???- note "share_memory" See :meth:`torch.Tensor.share_memory_`. - + ???- note "state_dict" Return a dictionary containing references to the whole state of the module. @@ -469,11 +469,11 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **args** - - **destination** – defaults to `None` - - **prefix** – defaults to `` - - **keep_vars** – defaults to `False` - + - **args** + - **destination** – defaults to `None` + - **prefix** – defaults to `` + - **keep_vars** – defaults to `False` + ???- note "to" Move and/or cast the parameters and buffers. @@ -482,9 +482,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **args** - - **kwargs** - + - **args** + - **kwargs** + ???- note "to_empty" Move the parameters and buffers to the specified device without copying storage. @@ -493,9 +493,9 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **device** (*Union[int, str, torch.device, NoneType]*) - - **recurse** (*bool*) – defaults to `True` - + - **device** (*Union[int, str, torch.device, NoneType]*) + - **recurse** (*bool*) – defaults to `True` + ???- note "train" Set the module in training mode. @@ -504,8 +504,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **mode** (*bool*) – defaults to `True` - + - **mode** (*bool*) – defaults to `True` + ???- note "type" Casts all parameters and buffers to :attr:`dst_type`. @@ -514,8 +514,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **dst_type** (*Union[torch.dtype, str]*) - + - **dst_type** (*Union[torch.dtype, str]*) + ???- note "xpu" Move all model parameters and buffers to the XPU. @@ -524,8 +524,8 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` - + - **device** (*Union[int, torch.device, NoneType]*) – defaults to `None` + ???- note "zero_grad" Reset gradients of all model parameters. @@ -534,5 +534,4 @@ Performs linear projection on the token embeddings to a lower dimension. **Parameters** - - **set_to_none** (*bool*) – defaults to `True` - + - **set_to_none** (*bool*) – defaults to `True` diff --git a/docs/api/retrieve/ColBERT.md b/docs/api/retrieve/ColBERT.md index 09e0d22c..ba75b65f 100644 --- a/docs/api/retrieve/ColBERT.md +++ b/docs/api/retrieve/ColBERT.md @@ -86,9 +86,8 @@ ColBERT retriever. 
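Before the parameter list, an end-to-end sketch of the retriever, following the index workflow from the Voyager documentation; the checkpoint name and folder names are placeholders:

```python
from pylate import indexes, models, retrieve

model = models.ColBERT(model_name_or_path="lightonai/colbertv2.0")  # assumed checkpoint

index = indexes.Voyager(index_folder="pylate-index", index_name="index", override=True)
index.add_documents(
    documents_ids=["doc-1", "doc-2"],
    documents_embeddings=model.encode(
        ["PyLate is a ColBERT library.", "Voyager is an approximate nearest neighbor index."],
        is_query=False,
    ),
)

retriever = retrieve.ColBERT(index=index)
scores = retriever.retrieve(
    queries_embeddings=model.encode(["what is pylate?"], is_query=True),
    k=10,         # final number of documents per query
    k_token=100,  # candidates fetched per query token before re-scoring
)
print(scores)
```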
 **Parameters**
 
-    - **queries_embeddings** (*list[list | numpy.ndarray | torch.Tensor]*)
-    - **k** (*int*) – defaults to `10`
-    - **k_token** (*int*) – defaults to `100`
-    - **device** (*str | None*) – defaults to `None`
-    - **batch_size** (*int*) – defaults to `50`
-
+    - **queries_embeddings** (*list[list | numpy.ndarray | torch.Tensor]*)
+    - **k** (*int*) – defaults to `10`
+    - **k_token** (*int*) – defaults to `100`
+    - **device** (*str | None*) – defaults to `None`
+    - **batch_size** (*int*) – defaults to `50`
diff --git a/docs/api/utils/ColBERTCollator.md b/docs/api/utils/ColBERTCollator.md
index e771922c..03a4d3aa 100644
--- a/docs/api/utils/ColBERTCollator.md
+++ b/docs/api/utils/ColBERTCollator.md
@@ -66,5 +66,4 @@ Collator for ColBERT model.
 
     **Parameters**
 
-    - **features** (*list[dict]*)
-
+    - **features** (*list[dict]*)
diff --git a/docs/api/utils/KDProcessing.md b/docs/api/utils/KDProcessing.md
index 366ed177..e444336f 100644
--- a/docs/api/utils/KDProcessing.md
+++ b/docs/api/utils/KDProcessing.md
@@ -68,13 +68,12 @@ Dataset processing class for knowledge distillation training.
 
     **Parameters**
 
-    - **example** (*dict*)
-
+    - **example** (*dict*)
+
 ???- note "transform"
 
     Update the input dataset with the queries and documents.
 
     **Parameters**
 
-    - **examples** (*dict*)
-
+    - **examples** (*dict*)
diff --git a/pylate/hf_hub/model_card.py b/pylate/hf_hub/model_card.py
index 8649c6ae..6cdd02c0 100644
--- a/pylate/hf_hub/model_card.py
+++ b/pylate/hf_hub/model_card.py
@@ -9,7 +9,6 @@
 import torch
 import transformers
-from huggingface_hub import ModelCard
 from sentence_transformers import SentenceTransformerModelCardData
 from sentence_transformers import __version__ as sentence_transformers_version
 from sentence_transformers.util import (
@@ -130,6 +129,9 @@ class PylateModelCardData(SentenceTransformerModelCardData):
     pipeline_tag: str = field(default="sentence-similarity", init=False)
     library_name: str = field(default="PyLate", init=False)
     version: dict[str, str] = field(default_factory=get_versions, init=False)
+    template_path: Path = field(
+        default=Path(__file__).parent / "model_card_template.md", init=False
+    )
 
     # Passed via `register_model` only
     model: SentenceTransformer | None = field(default=None, init=False, repr=False)
@@ -249,11 +251,3 @@ def set_widget_examples(self, dataset) -> None:
             The dataset to create widget examples from.
         """
         pass
-
-
-def generate_model_card(model: SentenceTransformer) -> str:
-    template_path = Path(__file__).parent / "model_card_template.md"
-    model_card = ModelCard.from_template(
-        card_data=model.model_card_data, template_path=template_path, hf_emoji="🐕"
-    )
-    return model_card.content
diff --git a/pylate/losses/cached_contrastive.py b/pylate/losses/cached_contrastive.py
index 5cefc535..ed56eaa2 100644
--- a/pylate/losses/cached_contrastive.py
+++ b/pylate/losses/cached_contrastive.py
@@ -26,7 +26,12 @@ class RandContext:
     def __init__(self, *tensors) -> None:
         self.fwd_cpu_state = torch.get_rng_state()
-        self.fwd_gpu_devices, self.fwd_gpu_states = get_device_states(*tensors)
+        if torch.backends.mps.is_available():
+            raise RuntimeError(
+                "MPS backend is not supported for this operation. Please use CPU or CUDA."
+            )
+        else:
+            self.fwd_gpu_devices, self.fwd_gpu_states = get_device_states(*tensors)
 
     def __enter__(self) -> None:
         self._fork = torch.random.fork_rng(devices=self.fwd_gpu_devices, enabled=True)
diff --git a/pylate/models/colbert.py b/pylate/models/colbert.py
index bed86157..0e9ee9ca 100644
--- a/pylate/models/colbert.py
+++ b/pylate/models/colbert.py
@@ -6,8 +6,6 @@
 import math
 import os
 import string
-import traceback
-from pathlib import Path
 from typing import Iterable, Literal, Optional
 
 import numpy as np
@@ -23,7 +21,7 @@
 from tqdm.autonotebook import trange
 from transformers.utils import cached_file
 
-from ..hf_hub.model_card import PylateModelCardData, generate_model_card
+from ..hf_hub.model_card import PylateModelCardData
 from ..scores import SimilarityFunction
 from ..utils import _start_multi_process_pool
 from .Dense import Dense
@@ -281,7 +279,7 @@ def __init__(
                 with open(metadata, "r") as f:
                     metadata = json.load(f)
                 # If the user do not override the values, read from config file
-                meta_query_token_id = metadata.get("query_token_id", None)
+                meta_query_token_id = metadata.get("query_token_id", None)
                 if self.query_prefix is None and meta_query_token_id:
                     self.query_prefix = meta_query_token_id
 
@@ -289,19 +287,23 @@
                 if self.document_prefix is None and meta_doc_token_id:
                     self.document_prefix = meta_doc_token_id
 
-                meta_query_maxlen = metadata.get("query_maxlen", None)
+                meta_query_maxlen = metadata.get("query_maxlen", None)
                 if self.query_length is None and meta_query_maxlen:
                     self.query_length = meta_query_maxlen
 
-                meta_doc_maxlen = metadata.get("doc_maxlen", None)
                 if self.document_length is None and meta_doc_maxlen:
                     self.document_length = meta_doc_maxlen
 
-                meta_attend_to_mask_tokens = metadata.get("attend_to_mask_tokens", None)
-                if self.attend_to_expansion_tokens is None and meta_attend_to_mask_tokens:
+                meta_attend_to_mask_tokens = metadata.get(
+                    "attend_to_mask_tokens", None
+                )
+                if (
+                    self.attend_to_expansion_tokens is None
+                    and meta_attend_to_mask_tokens
+                ):
                     self.attend_to_expansion_tokens = meta_attend_to_mask_tokens
-
+
                 logger.info("Loaded the configuration from Stanford NLP model.")
         except EnvironmentError:
             if self.query_prefix is None:
@@ -1236,41 +1238,3 @@ def _load_sbert_model(
             if isinstance(module, Transformer)
             or isinstance(module, DenseSentenceTransformer)
         ],
         module_kwargs
-
-    def _create_model_card(
-        self, path: str, model_name: str | None = None, train_datasets: list[str] | None = "deprecated"
-    ) -> None:
-        """
-        Create an automatic model and stores it in the specified path. If no training was done and the loaded model
-        was a Sentence Transformer model already, then its model card is reused.
-
-        Args:
-            path (str): The path where the model card will be stored.
-            model_name (Optional[str], optional): The name of the model. Defaults to None.
-            train_datasets (Optional[List[str]], optional): Deprecated argument. Defaults to "deprecated".
-
-        Returns:
-            None
-        """
-        if model_name:
-            model_path = Path(model_name)
-            if not model_path.exists() and not self.model_card_data.model_id:
-                self.model_card_data.model_id = model_name
-
-        # If we loaded a Sentence Transformer model from the Hub, and no training was done, then
-        # we don't generate a new model card, but reuse the old one instead.
-        if self._model_card_text and "generated_from_trainer" not in self.model_card_data.tags:
-            model_card = self._model_card_text
-        else:
-            try:
-                model_card = generate_model_card(self)
-            except Exception:
-                logger.error(
-                    f"Error while generating model card:\n{traceback.format_exc()}"
-                    "Consider opening an issue on https://github.com/UKPLab/sentence-transformers/issues with this traceback.\n"
-                    "Skipping model card creation."
-                )
-                return
-
-        with open(os.path.join(path, "README.md"), "w", encoding="utf8") as fOut:
-            fOut.write(model_card)
diff --git a/pylate/utils/distributed.py b/pylate/utils/distributed.py
index e4ecaf82..f724953e 100644
--- a/pylate/utils/distributed.py
+++ b/pylate/utils/distributed.py
@@ -53,8 +53,8 @@ def all_gather(tensor: torch.Tensor) -> Sequence[torch.Tensor]:
     # Warn once about uninitialized or single-GPU usage.
     if not _has_warned_dist_not_initialized:
         warning = """
-        Trying to gather while torch.distributed is not available or has not been initialized,
-        returning the original (local) tensor. This is expected if you are
+        Trying to gather while torch.distributed is not available or has not been initialized,
+        returning the original (local) tensor. This is expected if you are
         only using one GPU; consider not using gathering to remove this warning.
         """
         logger.warning(warning)
diff --git a/setup.py b/setup.py
index cd78bd37..95a783ec 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
     long_description = fh.read()
 
 base_packages = [
-    "sentence-transformers == 3.4.1",
+    "sentence-transformers == 4.0.2",
     "datasets >= 2.20.0",
     "accelerate >= 0.31.0",
     "voyager >= 2.0.9",
@@ -22,6 +22,7 @@
     "ruff >= 0.4.9",
     "pytest-cov >= 5.0.0",
     "pytest-xdist >=3.6.0",
+    "pytest-rerunfailures >= 15.0.0",
     "pytest >= 8.2.1",
     "pandas >= 2.2.1",
     "mkdocs-material == 9.5.32",
diff --git a/tests/test_contrastive.py b/tests/test_contrastive.py
index ca9f4b67..4d2fe381 100644
--- a/tests/test_contrastive.py
+++ b/tests/test_contrastive.py
@@ -6,6 +6,8 @@
 import shutil
 
 import pandas as pd
+import pytest
+import torch
 from datasets import load_dataset
 from sentence_transformers import (
     SentenceTransformerTrainer,
@@ -16,6 +18,7 @@
 from pylate import evaluation, losses, models, utils
 
 
+@pytest.mark.skipif(torch.backends.mps.is_available(), reason="MPS is not supported")
 def test_contrastive_training() -> None:
     """Test constrastive training."""
     if os.path.exists(path="tests/contrastive"):
diff --git a/tests/test_model.py b/tests/test_model.py
index 178e6845..76cd9d61 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -2,11 +2,13 @@
 
 import math
 
+import pytest
 import torch
 
 from pylate import models, rank
 
 
+@pytest.mark.flaky(reruns=3, reruns_delay=5)
 def test_model_creation(**kwargs) -> None:
     """Test the creation of different models."""
     query = ["fruits are healthy."]
diff --git a/tests/test_model_loading.py b/tests/test_model_loading.py
index b4c5316c..c727d307 100644
--- a/tests/test_model_loading.py
+++ b/tests/test_model_loading.py
@@ -5,6 +5,7 @@
 
 from pylate import models
 
 
+@pytest.mark.flaky(reruns=3, reruns_delay=5)
 @pytest.mark.parametrize(
     "model_name_or_path, revision, query_prefix, document_prefix, max_seq_length, query_length, config",
     [
@@ -65,8 +66,8 @@
             "[unused1]",
             514,
             32,
-            {},
-        )
+            {},
+        ),
     ],
 )
 def test_load_model(